From 883dbb9c86be87593a58ef10b070b3a0564c7fee Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 22 Mar 2023 15:43:46 +0100 Subject: [PATCH 001/208] Revert "[MemProf] Context disambiguation cloning pass [patch 1a/3]" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit d6ad4f01c3dafcab335bca66dac6e36d9eac8421. Fails to build on at least gcc 12.2: /home/npopov/repos/llvm-project/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp:482:1: error: no declaration matches ‘ContextNode* CallsiteContextGraph::getNodeForInst(const CallInfo&)’ 482 | CallsiteContextGraph::getNodeForInst( | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /home/npopov/repos/llvm-project/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp:393:16: note: candidate is: ‘CallsiteContextGraph::ContextNode* CallsiteContextGraph::getNodeForInst(const CallInfo&)’ 393 | ContextNode *getNodeForInst(const CallInfo &C); | ^~~~~~~~~~~~~~ /home/npopov/repos/llvm-project/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp:99:7: note: ‘class CallsiteContextGraph’ defined here 99 | class CallsiteContextGraph { | ^~~~~~~~~~~~~~~~~~~~ --- .../IPO/MemProfContextDisambiguation.h | 38 - llvm/lib/Passes/PassBuilder.cpp | 1 - llvm/lib/Passes/PassBuilderPipelines.cpp | 11 - llvm/lib/Passes/PassRegistry.def | 1 - llvm/lib/Transforms/IPO/CMakeLists.txt | 1 - .../IPO/MemProfContextDisambiguation.cpp | 1583 ----------------- llvm/test/ThinLTO/X86/memprof-summary.ll | 184 ++ .../MemProfContextDisambiguation/basic.ll | 158 -- .../duplicate-context-ids.ll | 232 --- .../duplicate-context-ids2.ll | 386 ---- .../indirectcall.ll | 261 --- .../MemProfContextDisambiguation/inlined.ll | 189 -- .../MemProfContextDisambiguation/inlined2.ll | 135 -- .../pass-pipeline.ll | 41 - 14 files changed, 184 insertions(+), 3037 deletions(-) delete mode 100644 llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h delete mode 100644 llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp create mode 100644 llvm/test/ThinLTO/X86/memprof-summary.ll delete mode 100644 llvm/test/Transforms/MemProfContextDisambiguation/basic.ll delete mode 100644 llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids.ll delete mode 100644 llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids2.ll delete mode 100644 llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll delete mode 100644 llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll delete mode 100644 llvm/test/Transforms/MemProfContextDisambiguation/inlined2.ll delete mode 100644 llvm/test/Transforms/MemProfContextDisambiguation/pass-pipeline.ll diff --git a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h deleted file mode 100644 index 56e56ed67f7df..0000000000000 --- a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h +++ /dev/null @@ -1,38 +0,0 @@ -//==- MemProfContextDisambiguation.h - Context Disambiguation ----*- C++ -*-==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Implements support for context disambiguation of allocation calls for profile -// guided heap optimization using memprof metadata. See implementation file for -// details. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TRANSFORMS_IPO_MEMPROF_CONTEXT_DISAMBIGUATION_H -#define LLVM_TRANSFORMS_IPO_MEMPROF_CONTEXT_DISAMBIGUATION_H - -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/StringSet.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/PassManager.h" - -namespace llvm { -class Module; - -class MemProfContextDisambiguation - : public PassInfoMixin { - /// Run the context disambiguator on \p M, returns true if any changes made. - bool processModule(Module &M); - -public: - MemProfContextDisambiguation() {} - - PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); -}; -} // end namespace llvm - -#endif // LLVM_TRANSFORMS_IPO_MEMPROF_CONTEXT_DISAMBIGUATION_H diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index a04f8bbaa5dc0..89d2e6a4b2d1a 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -117,7 +117,6 @@ #include "llvm/Transforms/IPO/Internalize.h" #include "llvm/Transforms/IPO/LoopExtractor.h" #include "llvm/Transforms/IPO/LowerTypeTests.h" -#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h" #include "llvm/Transforms/IPO/MergeFunctions.h" #include "llvm/Transforms/IPO/ModuleInliner.h" #include "llvm/Transforms/IPO/OpenMPOpt.h" diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index aaabe23049288..1d386139d9e6c 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -57,7 +57,6 @@ #include "llvm/Transforms/IPO/InferFunctionAttrs.h" #include "llvm/Transforms/IPO/Inliner.h" #include "llvm/Transforms/IPO/LowerTypeTests.h" -#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h" #include "llvm/Transforms/IPO/MergeFunctions.h" #include "llvm/Transforms/IPO/ModuleInliner.h" #include "llvm/Transforms/IPO/OpenMPOpt.h" @@ -272,10 +271,6 @@ static cl::opt AttributorRun( clEnumValN(AttributorRunOption::NONE, "none", "disable attributor runs"))); -cl::opt EnableMemProfContextDisambiguation( - "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden, - cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation")); - PipelineTuningOptions::PipelineTuningOptions() { LoopInterleaving = true; LoopVectorization = true; @@ -1714,12 +1709,6 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, InlineContext{ThinOrFullLTOPhase::FullLTOPostLink, InlinePass::CGSCCInliner})); - // Perform context disambiguation after inlining, since that would reduce the - // amount of additional cloning required to distinguish the allocation - // contexts. - if (EnableMemProfContextDisambiguation) - MPM.addPass(MemProfContextDisambiguation()); - // Optimize globals again after we ran the inliner. 
MPM.addPass(GlobalOptPass()); diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 82592a1ee9b55..04d648580a8c5 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -87,7 +87,6 @@ MODULE_PASS("name-anon-globals", NameAnonGlobalPass()) MODULE_PASS("no-op-module", NoOpModulePass()) MODULE_PASS("objc-arc-apelim", ObjCARCAPElimPass()) MODULE_PASS("partial-inliner", PartialInlinerPass()) -MODULE_PASS("memprof-context-disambiguation", MemProfContextDisambiguation()) MODULE_PASS("pgo-icall-prom", PGOIndirectCallPromotion()) MODULE_PASS("pgo-instr-gen", PGOInstrumentationGen()) MODULE_PASS("pgo-instr-use", PGOInstrumentationUse()) diff --git a/llvm/lib/Transforms/IPO/CMakeLists.txt b/llvm/lib/Transforms/IPO/CMakeLists.txt index e03aff0f65d7a..063a9a60d0cb5 100644 --- a/llvm/lib/Transforms/IPO/CMakeLists.txt +++ b/llvm/lib/Transforms/IPO/CMakeLists.txt @@ -27,7 +27,6 @@ add_llvm_component_library(LLVMipo Internalize.cpp LoopExtractor.cpp LowerTypeTests.cpp - MemProfContextDisambiguation.cpp MergeFunctions.cpp ModuleInliner.cpp OpenMPOpt.cpp diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp deleted file mode 100644 index fc8b12df67822..0000000000000 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ /dev/null @@ -1,1583 +0,0 @@ -//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements support for context disambiguation of allocation -// calls for profile guided heap optimization. Specifically, it uses Memprof -// profiles which indicate context specific allocation behavior (currently -// distinguishing cold vs hot memory allocations). Cloning is performed to -// expose the cold allocation call contexts, and the allocation calls are -// subsequently annotated with an attribute for later transformation. -// -// The transformations can be performed either directly on IR (regular LTO), or -// (eventually) on a ThinLTO index (later applied to the IR during the ThinLTO -// backend). Both types of LTO operate on a the same base graph representation, -// which uses CRTP to support either IR or Index formats. 
-// -//===----------------------------------------------------------------------===// - -#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/SetOperations.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/MemoryProfileInfo.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/GraphWriter.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/IPO.h" -#include -#include -using namespace llvm; -using namespace llvm::memprof; - -#define DEBUG_TYPE "memprof-context-disambiguation" - -static cl::opt DotFilePathPrefix( - "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden, - cl::value_desc("filename"), - cl::desc("Specify the path prefix of the MemProf dot files.")); - -static cl::opt ExportToDot("memprof-export-to-dot", cl::init(false), - cl::Hidden, - cl::desc("Export graph to dot files.")); - -static cl::opt - DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden, - cl::desc("Dump CallingContextGraph to stdout after each stage.")); - -static cl::opt - VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden, - cl::desc("Perform verification checks on CallingContextGraph.")); - -static cl::opt - VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden, - cl::desc("Perform frequent verification checks on nodes.")); - -inline bool hasSingleAllocType(uint8_t AllocTypes) { - switch (AllocTypes) { - case (uint8_t)AllocationType::Cold: - case (uint8_t)AllocationType::NotCold: - return true; - break; - case (uint8_t)AllocationType::None: - assert(false); - break; - default: - return false; - break; - } - llvm_unreachable("invalid alloc type"); -} - -/// CRTP base for graphs built from either IR or ThinLTO summary index. -/// -/// The graph represents the call contexts in all memprof metadata on allocation -/// calls, with nodes for the allocations themselves, as well as for the calls -/// in each context. The graph is initially built from the allocation memprof -/// metadata (or summary) MIBs. It is then updated to match calls with callsite -/// metadata onto the nodes, updating it to reflect any inlining performed on -/// those calls. -/// -/// Each MIB (representing an allocation's call context with allocation -/// behavior) is assigned a unique context id during the graph build. The edges -/// and nodes in the graph are decorated with the context ids they carry. This -/// is used to correctly update the graph when cloning is performed so that we -/// can uniquify the context for a single (possibly cloned) allocation. -template -class CallsiteContextGraph { -public: - CallsiteContextGraph() = default; - CallsiteContextGraph(const CallsiteContextGraph &) = default; - CallsiteContextGraph(CallsiteContextGraph &&) = default; - - /// Main entry point to perform analysis and transformations on graph. 
- bool process(); - - void dump() const; - void print(raw_ostream &OS) const; - - friend raw_ostream &operator<<(raw_ostream &OS, - const CallsiteContextGraph &CCG) { - CCG.print(OS); - return OS; - } - - friend struct GraphTraits< - const CallsiteContextGraph *>; - friend struct DOTGraphTraits< - const CallsiteContextGraph *>; - - void exportToDot(std::string Label) const; - - /// Represents a function clone via FuncTy pointer and clone number pair. - struct FuncInfo final - : public std::pair { - using Base = std::pair; - FuncInfo(const Base &B) : Base(B) {} - FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {} - explicit operator bool() const { return this->first != nullptr; } - FuncTy *func() const { return this->first; } - unsigned cloneNo() const { return this->second; } - }; - - /// Represents a callsite clone via CallTy and clone number pair. - struct CallInfo final : public std::pair { - using Base = std::pair; - CallInfo(const Base &B) : Base(B) {} - CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0) - : Base(Call, CloneNo) {} - explicit operator bool() const { return (bool)this->first; } - CallTy call() const { return this->first; } - unsigned cloneNo() const { return this->second; } - void setCloneNo(unsigned N) { this->second = N; } - void print(raw_ostream &OS) const { - if (!operator bool()) { - assert(!cloneNo()); - OS << "null Call"; - return; - } - call()->print(OS); - OS << "\t(clone " << cloneNo() << ")"; - } - void dump() const { - print(dbgs()); - dbgs() << "\n"; - } - friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) { - Call.print(OS); - return OS; - } - }; - - struct ContextEdge; - - /// Node in the Callsite Context Graph - struct ContextNode { - // Keep this for now since in the IR case where we have an Instruction* it - // is not as immediately discoverable. Used for printing richer information - // when dumping graph. - bool IsAllocation; - - // Keeps track of when the Call was reset to null because there was - // recursion. - bool Recursive = false; - - // The corresponding allocation or interior call. - CallInfo Call; - - // For alloc nodes this is a unique id assigned when constructed, and for - // callsite stack nodes it is the original stack id when the node is - // constructed from the memprof MIB metadata on the alloc nodes. Note that - // this is only used when matching callsite metadata onto the stack nodes - // created when processing the allocation memprof MIBs, and for labeling - // nodes in the dot graph. Therefore we don't bother to assign a value for - // clones. - uint64_t OrigStackOrAllocId = 0; - - // This will be formed by ORing together the AllocationType enum values - // for contexts including this node. - uint8_t AllocTypes = 0; - - // Edges to all callees in the profiled call stacks. - // TODO: Should this be a map (from Callee node) for more efficient lookup? - std::vector> CalleeEdges; - - // Edges to all callers in the profiled call stacks. - // TODO: Should this be a map (from Caller node) for more efficient lookup? - std::vector> CallerEdges; - - // The set of IDs for contexts including this node. - DenseSet ContextIds; - - // List of clones of this ContextNode, initially empty. - std::vector Clones; - - // If a clone, points to the original uncloned node. 
- ContextNode *CloneOf = nullptr; - - ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {} - - ContextNode(bool IsAllocation, CallInfo C) - : IsAllocation(IsAllocation), Call(C) {} - - std::unique_ptr clone() { - auto Clone = std::make_unique(IsAllocation, Call); - if (CloneOf) { - CloneOf->Clones.push_back(Clone.get()); - Clone->CloneOf = CloneOf; - } else { - Clones.push_back(Clone.get()); - Clone->CloneOf = this; - } - return Clone; - } - - ContextNode *getOrigNode() { - if (!CloneOf) - return this; - return CloneOf; - } - - void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType, - unsigned int ContextId); - - ContextEdge *findEdgeFromCallee(const ContextNode *Callee); - ContextEdge *findEdgeFromCaller(const ContextNode *Caller); - void eraseCalleeEdge(const ContextEdge *Edge); - void eraseCallerEdge(const ContextEdge *Edge); - - void setCall(CallInfo C) { Call = C; } - - bool hasCall() const { return (bool)Call.call(); } - - void printCall(raw_ostream &OS) const { Call.print(OS); } - - // True if this node was effectively removed from the graph, in which case - // its context id set, caller edges, and callee edges should all be empty. - bool isRemoved() const { - assert(ContextIds.empty() == - (CalleeEdges.empty() && CallerEdges.empty())); - return ContextIds.empty(); - } - - void dump() const; - void print(raw_ostream &OS) const; - - friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) { - Node.print(OS); - return OS; - } - }; - - /// Edge in the Callsite Context Graph from a ContextNode N to a caller or - /// callee. - struct ContextEdge { - ContextNode *Callee; - ContextNode *Caller; - - // This will be formed by ORing together the AllocationType enum values - // for contexts including this edge. - uint8_t AllocTypes = 0; - - // The set of IDs for contexts including this edge. - DenseSet ContextIds; - - ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType, - DenseSet ContextIds) - : Callee(Callee), Caller(Caller), AllocTypes(AllocType), - ContextIds(ContextIds) {} - - DenseSet &getContextIds() { return ContextIds; } - - void dump() const; - void print(raw_ostream &OS) const; - - friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) { - Edge.print(OS); - return OS; - } - }; - -protected: - /// Get a list of nodes corresponding to the stack ids in the given callsite - /// context. - template - std::vector - getStackIdsWithContextNodes(CallStack &CallsiteContext); - - /// Adds nodes for the given allocation and any stack ids on its memprof MIB - /// metadata (or summary). - ContextNode *addAllocNode(CallInfo Call, const FuncTy *F); - - /// Adds nodes for the given MIB stack ids. - template - void addStackNodesForMIB(ContextNode *AllocNode, - CallStack &StackContext, - CallStack &CallsiteContext, - AllocationType AllocType); - - /// Matches all callsite metadata (or summary) to the nodes created for - /// allocation memprof MIB metadata, synthesizing new nodes to reflect any - /// inlining performed on those callsite instructions. - void updateStackNodes(); - - /// Update graph to conservatively handle any callsite stack nodes that target - /// multiple different callee target functions. - void handleCallsitesWithMultipleTargets(); - - /// Save lists of calls with MemProf metadata in each function, for faster - /// iteration. - std::vector>> - FuncToCallsWithMetadata; - - /// Map from callsite node to the enclosing caller function. 
- std::map NodeToCallingFunc; - -private: - using EdgeIter = typename std::vector>::iterator; - - using CallContextInfo = std::tuple, - const FuncTy *, DenseSet>; - - /// Assigns the given Node to calls at or inlined into the location with - /// the Node's stack id, after post order traversing and processing its - /// caller nodes. Uses the call information recorded in the given - /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences - /// as needed. Called by updateStackNodes which sets up the given - /// StackIdToMatchingCalls map. - void assignStackNodesPostOrder( - ContextNode *Node, DenseSet &Visited, - DenseMap> &StackIdToMatchingCalls); - - /// Duplicates the given set of context ids, updating the provided - /// map from each original id with the newly generated context ids, - /// and returning the new duplicated id set. - DenseSet duplicateContextIds( - const DenseSet &StackSequenceContextIds, - DenseMap> &OldToNewContextIds); - - /// Propagates all duplicated context ids across the graph. - void propagateDuplicateContextIds( - const DenseMap> &OldToNewContextIds); - - /// Connect the NewNode to OrigNode's callees if TowardsCallee is true, - /// else to its callers. Also updates OrigNode's edges to remove any context - /// ids moved to the newly created edge. - void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode, - bool TowardsCallee); - - /// Get the stack id corresponding to the given Id or Index (for IR this will - /// return itself, for a summary index this will return the id recorded in the - /// index for that stack id index value). - uint64_t getStackId(uint64_t IdOrIndex) const { - return static_cast(this)->getStackId(IdOrIndex); - } - - /// Returns true if the given call targets the given function. - bool calleeMatchesFunc(CallTy Call, const FuncTy *Func) { - return static_cast(this)->calleeMatchesFunc(Call, Func); - } - - /// Get a list of nodes corresponding to the stack ids in the given - /// callsite's context. - std::vector getStackIdsWithContextNodesForCall(CallTy Call) { - return static_cast(this)->getStackIdsWithContextNodesForCall( - Call); - } - - /// Get the last stack id in the context for callsite. - uint64_t getLastStackId(CallTy Call) { - return static_cast(this)->getLastStackId(Call); - } - - /// Gets a label to use in the dot graph for the given call clone in the given - /// function. - std::string getLabel(const FuncTy *Func, const CallTy Call, - unsigned CloneNo) const { - return static_cast(this)->getLabel(Func, Call, CloneNo); - } - - /// Helpers to find the node corresponding to the given call or stackid. - ContextNode *getNodeForInst(const CallInfo &C); - ContextNode *getNodeForAlloc(const CallInfo &C); - ContextNode *getNodeForStackId(uint64_t StackId); - - /// Removes the node information recorded for the given call. - void unsetNodeForInst(const CallInfo &C); - - /// Computes the alloc type corresponding to the given context ids, by - /// unioning their recorded alloc types. - uint8_t computeAllocType(DenseSet &ContextIds); - - /// Map from each context ID to the AllocationType assigned to that context. - std::map ContextIdToAllocationType; - - /// Identifies the context node created for a stack id when adding the MIB - /// contexts to the graph. This is used to locate the context nodes when - /// trying to assign the corresponding callsites with those stack ids to these - /// nodes. - std::map StackEntryIdToContextNodeMap; - - /// Maps to track the calls to their corresponding nodes in the graph. 
- std::map AllocationCallToContextNodeMap; - std::map NonAllocationCallToContextNodeMap; - - /// Owner of all ContextNode unique_ptrs. - std::vector> NodeOwner; - - /// Perform sanity checks on graph when requested. - void check() const; - - /// Keeps track of the last unique context id assigned. - unsigned int LastContextId = 0; -}; - -template -using ContextNode = - typename CallsiteContextGraph::ContextNode; -template -using ContextEdge = - typename CallsiteContextGraph::ContextEdge; -template -using FuncInfo = - typename CallsiteContextGraph::FuncInfo; -template -using CallInfo = - typename CallsiteContextGraph::CallInfo; - -/// CRTP derived class for graphs built from IR (regular LTO). -class ModuleCallsiteContextGraph - : public CallsiteContextGraph { -public: - ModuleCallsiteContextGraph(Module &M); - -private: - friend CallsiteContextGraph; - - uint64_t getStackId(uint64_t IdOrIndex) const; - bool calleeMatchesFunc(Instruction *Call, const Function *Func); - uint64_t getLastStackId(Instruction *Call); - std::vector getStackIdsWithContextNodesForCall(Instruction *Call); - std::string getLabel(const Function *Func, const Instruction *Call, - unsigned CloneNo) const; - - const Module &Mod; -}; - -namespace { - -struct FieldSeparator { - bool Skip = true; - const char *Sep; - - FieldSeparator(const char *Sep = ", ") : Sep(Sep) {} -}; - -raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) { - if (FS.Skip) { - FS.Skip = false; - return OS; - } - return OS << FS.Sep; -} - -} // end anonymous namespace - -template -ContextNode * -CallsiteContextGraph::getNodeForInst( - const CallInfo &C) { - ContextNode *Node = getNodeForAlloc(C); - if (Node) - return Node; - - auto NonAllocCallNode = NonAllocationCallToContextNodeMap.find(C); - if (NonAllocCallNode != NonAllocationCallToContextNodeMap.end()) { - return NonAllocCallNode->second; - } - return nullptr; -} - -template -ContextNode * -CallsiteContextGraph::getNodeForAlloc( - const CallInfo &C) { - auto AllocCallNode = AllocationCallToContextNodeMap.find(C); - if (AllocCallNode != AllocationCallToContextNodeMap.end()) { - return AllocCallNode->second; - } - return nullptr; -} - -template -ContextNode * -CallsiteContextGraph::getNodeForStackId( - uint64_t StackId) { - auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId); - if (StackEntryNode != StackEntryIdToContextNodeMap.end()) - return StackEntryNode->second; - return nullptr; -} - -template -void CallsiteContextGraph::unsetNodeForInst( - const CallInfo &C) { - AllocationCallToContextNodeMap.erase(C) || - NonAllocationCallToContextNodeMap.erase(C); - assert(!AllocationCallToContextNodeMap.count(C) && - !NonAllocationCallToContextNodeMap.count(C)); -} - -template -void CallsiteContextGraph::ContextNode:: - addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType, - unsigned int ContextId) { - for (auto &Edge : CallerEdges) { - if (Edge->Caller == Caller) { - Edge->AllocTypes |= (uint8_t)AllocType; - Edge->getContextIds().insert(ContextId); - return; - } - } - std::shared_ptr Edge = std::make_shared( - this, Caller, (uint8_t)AllocType, DenseSet({ContextId})); - CallerEdges.push_back(Edge); - Caller->CalleeEdges.push_back(Edge); -} - -template -ContextEdge * -CallsiteContextGraph::ContextNode:: - findEdgeFromCallee(const ContextNode *Callee) { - for (const auto &Edge : CalleeEdges) - if (Edge->Callee == Callee) - return Edge.get(); - return nullptr; -} - -template -ContextEdge * -CallsiteContextGraph::ContextNode:: - findEdgeFromCaller(const ContextNode *Caller) { 
- for (const auto &Edge : CallerEdges) - if (Edge->Caller == Caller) - return Edge.get(); - return nullptr; -} - -template -void CallsiteContextGraph::ContextNode:: - eraseCalleeEdge(const ContextEdge *Edge) { - auto EI = - std::find_if(CalleeEdges.begin(), CalleeEdges.end(), - [Edge](const std::shared_ptr &CalleeEdge) { - return CalleeEdge.get() == Edge; - }); - assert(EI != CalleeEdges.end()); - CalleeEdges.erase(EI); -} - -template -void CallsiteContextGraph::ContextNode:: - eraseCallerEdge(const ContextEdge *Edge) { - auto EI = - std::find_if(CallerEdges.begin(), CallerEdges.end(), - [Edge](const std::shared_ptr &CallerEdge) { - return CallerEdge.get() == Edge; - }); - assert(EI != CallerEdges.end()); - CallerEdges.erase(EI); -} - -template -uint8_t CallsiteContextGraph::computeAllocType( - DenseSet &ContextIds) { - uint8_t BothTypes = - (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold; - uint8_t AllocType = (uint8_t)AllocationType::None; - for (auto Id : ContextIds) { - AllocType |= (uint8_t)ContextIdToAllocationType[Id]; - // Bail early if alloc type reached both, no further refinement. - if (AllocType == BothTypes) - return AllocType; - } - return AllocType; -} - -template -ContextNode * -CallsiteContextGraph::addAllocNode( - CallInfo Call, const FuncTy *F) { - assert(!getNodeForAlloc(Call)); - NodeOwner.push_back( - std::make_unique(/*IsAllocation=*/true, Call)); - ContextNode *AllocNode = NodeOwner.back().get(); - AllocationCallToContextNodeMap[Call] = AllocNode; - NodeToCallingFunc[AllocNode] = F; - // Use LastContextId as a uniq id for MIB allocation nodes. - AllocNode->OrigStackOrAllocId = LastContextId; - // Alloc type should be updated as we add in the MIBs. We should assert - // afterwards that it is not still None. - AllocNode->AllocTypes = (uint8_t)AllocationType::None; - - return AllocNode; -} - -template -template -void CallsiteContextGraph::addStackNodesForMIB( - ContextNode *AllocNode, CallStack &StackContext, - CallStack &CallsiteContext, AllocationType AllocType) { - ContextIdToAllocationType[++LastContextId] = AllocType; - - // Update alloc type and context ids for this MIB. - AllocNode->AllocTypes |= (uint8_t)AllocType; - AllocNode->ContextIds.insert(LastContextId); - - // Now add or update nodes for each stack id in alloc's context. - // Later when processing the stack ids on non-alloc callsites we will adjust - // for any inlining in the context. - ContextNode *PrevNode = AllocNode; - // Look for recursion (direct recursion should have been collapsed by - // module summary analysis, here we should just be detecting mutual - // recursion). Mark these nodes so we don't try to clone. - SmallSet StackIdSet; - // Skip any on the allocation call (inlining). 
- for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext); - ContextIter != StackContext.end(); ++ContextIter) { - auto StackId = getStackId(*ContextIter); - ContextNode *StackNode = getNodeForStackId(StackId); - if (!StackNode) { - NodeOwner.push_back( - std::make_unique(/*IsAllocation=*/false)); - StackNode = NodeOwner.back().get(); - StackEntryIdToContextNodeMap[StackId] = StackNode; - StackNode->OrigStackOrAllocId = StackId; - } - auto Ins = StackIdSet.insert(StackId); - if (!Ins.second) - StackNode->Recursive = true; - StackNode->ContextIds.insert(LastContextId); - StackNode->AllocTypes |= (uint8_t)AllocType; - PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId); - PrevNode = StackNode; - } -} - -template -DenseSet -CallsiteContextGraph::duplicateContextIds( - const DenseSet &StackSequenceContextIds, - DenseMap> &OldToNewContextIds) { - DenseSet NewContextIds; - for (auto OldId : StackSequenceContextIds) { - NewContextIds.insert(++LastContextId); - OldToNewContextIds[OldId].insert(LastContextId); - assert(ContextIdToAllocationType.count(OldId)); - // The new context has the same allocation type as original. - ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId]; - } - return NewContextIds; -} - -template -void CallsiteContextGraph:: - propagateDuplicateContextIds( - const DenseMap> &OldToNewContextIds) { - // Build a set of duplicated context ids corresponding to the input id set. - auto GetNewIds = [&OldToNewContextIds](const DenseSet &ContextIds) { - DenseSet NewIds; - for (auto Id : ContextIds) - if (auto NewId = OldToNewContextIds.find(Id); - NewId != OldToNewContextIds.end()) - NewIds.insert(NewId->second.begin(), NewId->second.end()); - return NewIds; - }; - - // Recursively update context ids sets along caller edges. - auto UpdateCallers = [&](ContextNode *Node, - DenseSet &Visited, - auto &&UpdateCallers) -> void { - for (auto Edge : Node->CallerEdges) { - auto Inserted = Visited.insert(Edge.get()); - if (!Inserted.second) - continue; - ContextNode *NextNode = Edge->Caller; - DenseSet NewIdsToAdd = GetNewIds(Edge->getContextIds()); - // Only need to recursively iterate to NextNode via this caller edge if - // it resulted in any added ids to NextNode. - if (!NewIdsToAdd.empty()) { - Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end()); - NextNode->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end()); - UpdateCallers(NextNode, Visited, UpdateCallers); - } - } - }; - - DenseSet Visited; - for (auto &Entry : AllocationCallToContextNodeMap) { - auto *Node = Entry.second; - // Update ids on the allocation nodes before calling the recursive - // update along caller edges, since this simplifies the logic during - // that traversal. - DenseSet NewIdsToAdd = GetNewIds(Node->ContextIds); - Node->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end()); - UpdateCallers(Node, Visited, UpdateCallers); - } -} - -template -void CallsiteContextGraph::connectNewNode( - ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee) { - // Make a copy of the context ids, since this will be adjusted below as they - // are moved. - DenseSet RemainingContextIds = NewNode->ContextIds; - auto &OrigEdges = - TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges; - // Increment iterator in loop so that we can remove edges as needed. 
- for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) { - auto Edge = *EI; - // Remove any matching context ids from Edge, return set that were found and - // removed, these are the new edge's context ids. Also update the remaining - // (not found ids). - DenseSet NewEdgeContextIds, NotFoundContextIds; - set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds, - NotFoundContextIds); - RemainingContextIds.swap(NotFoundContextIds); - // If no matching context ids for this edge, skip it. - if (NewEdgeContextIds.empty()) { - ++EI; - continue; - } - if (TowardsCallee) { - auto NewEdge = std::make_shared( - Edge->Callee, NewNode, computeAllocType(NewEdgeContextIds), - NewEdgeContextIds); - NewNode->CalleeEdges.push_back(NewEdge); - NewEdge->Callee->CallerEdges.push_back(NewEdge); - } else { - auto NewEdge = std::make_shared( - NewNode, Edge->Caller, computeAllocType(NewEdgeContextIds), - NewEdgeContextIds); - NewNode->CallerEdges.push_back(NewEdge); - NewEdge->Caller->CalleeEdges.push_back(NewEdge); - } - // Remove old edge if context ids empty. - if (Edge->getContextIds().empty()) { - if (TowardsCallee) { - Edge->Callee->eraseCallerEdge(Edge.get()); - EI = OrigNode->CalleeEdges.erase(EI); - } else { - Edge->Caller->eraseCalleeEdge(Edge.get()); - EI = OrigNode->CallerEdges.erase(EI); - } - continue; - } - ++EI; - } -} - -template -void CallsiteContextGraph:: - assignStackNodesPostOrder(ContextNode *Node, - DenseSet &Visited, - DenseMap> - &StackIdToMatchingCalls) { - auto Inserted = Visited.insert(Node); - if (!Inserted.second) - return; - // Post order traversal. Iterate over a copy since we may add nodes and - // therefore new callers during the recursive call, invalidating any - // iterator over the original edge vector. We don't need to process these - // new nodes as they were already processed on creation. - auto CallerEdges = Node->CallerEdges; - for (auto &Edge : CallerEdges) { - // Skip any that have been removed during the recursion. - if (!Edge) - continue; - assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls); - } - - // If this node's stack id is in the map, update the graph to contain new - // nodes representing any inlining at interior callsites. Note we move the - // associated context ids over to the new nodes. - - // Ignore this node if it is for an allocation or we didn't record any - // stack id lists ending at it. - if (Node->IsAllocation || - !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId)) - return; - - auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId]; - // Handle the simple case first. A single call with a single stack id. - // In this case there is no need to create any new context nodes, simply - // assign the context node for stack id to this Call. - if (Calls.size() == 1) { - auto &[Call, Ids, Func, SavedContextIds] = Calls[0]; - if (Ids.size() == 1) { - assert(SavedContextIds.empty()); - // It should be this Node - assert(Node == getNodeForStackId(Ids[0])); - if (Node->Recursive) - return; - Node->setCall(Call); - NonAllocationCallToContextNodeMap[Call] = Node; - NodeToCallingFunc[Node] = Func; - return; - } - } - - // Find the node for the last stack id, which should be the same - // across all calls recorded for this id, and is this node's id. - uint64_t LastId = Node->OrigStackOrAllocId; - ContextNode *LastNode = getNodeForStackId(LastId); - // We should only have kept stack ids that had nodes. 
- assert(LastNode); - - for (unsigned I = 0; I < Calls.size(); I++) { - auto &[Call, Ids, Func, SavedContextIds] = Calls[I]; - // Skip any for which we didn't assign any ids, these don't get a node in - // the graph. - if (SavedContextIds.empty()) - continue; - - assert(LastId == Ids.back()); - - ContextNode *FirstNode = getNodeForStackId(Ids[0]); - assert(FirstNode); - - // Recompute the context ids for this stack id sequence (the - // intersection of the context ids of the corresponding nodes). - // Start with the ids we saved in the map for this call, which could be - // duplicated context ids. We have to recompute as we might have overlap - // overlap between the saved context ids for different last nodes, and - // removed them already during the post order traversal. - set_intersect(SavedContextIds, FirstNode->ContextIds); - ContextNode *PrevNode = nullptr; - for (auto Id : Ids) { - ContextNode *CurNode = getNodeForStackId(Id); - // We should only have kept stack ids that had nodes and weren't - // recursive. - assert(CurNode); - assert(!CurNode->Recursive); - if (!PrevNode) { - PrevNode = CurNode; - continue; - } - auto *Edge = CurNode->findEdgeFromCallee(PrevNode); - if (!Edge) { - SavedContextIds.clear(); - break; - } - PrevNode = CurNode; - set_intersect(SavedContextIds, Edge->getContextIds()); - - // If we now have no context ids for clone, skip this call. - if (SavedContextIds.empty()) - break; - } - if (SavedContextIds.empty()) - continue; - - // Create new context node. - NodeOwner.push_back( - std::make_unique(/*IsAllocation=*/false, Call)); - ContextNode *NewNode = NodeOwner.back().get(); - NodeToCallingFunc[NewNode] = Func; - NonAllocationCallToContextNodeMap[Call] = NewNode; - NewNode->ContextIds = SavedContextIds; - NewNode->AllocTypes = computeAllocType(NewNode->ContextIds); - - // Connect to callees of innermost stack frame in inlined call chain. - // This updates context ids for FirstNode's callee's to reflect those - // moved to NewNode. - connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true); - - // Connect to callers of outermost stack frame in inlined call chain. - // This updates context ids for FirstNode's caller's to reflect those - // moved to NewNode. - connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false); - - // Now we need to remove context ids from edges/nodes between First and - // Last Node. - PrevNode = nullptr; - for (auto Id : Ids) { - ContextNode *CurNode = getNodeForStackId(Id); - // We should only have kept stack ids that had nodes. - assert(CurNode); - - // Remove the context ids moved to NewNode from CurNode, and the - // edge from the prior node. - set_subtract(CurNode->ContextIds, NewNode->ContextIds); - if (PrevNode) { - auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode); - assert(PrevEdge); - set_subtract(PrevEdge->getContextIds(), NewNode->ContextIds); - if (PrevEdge->getContextIds().empty()) { - PrevNode->eraseCallerEdge(PrevEdge); - CurNode->eraseCalleeEdge(PrevEdge); - } - } - PrevNode = CurNode; - } - } -} - -template -void CallsiteContextGraph::updateStackNodes() { - // Map of stack id to all calls with that as the last (outermost caller) - // callsite id that has a context node (some might not due to pruning - // performed during matching of the allocation profile contexts). - // The CallContextInfo contains the Call and a list of its stack ids with - // ContextNodes, the function containing Call, and the set of context ids - // the analysis will eventually identify for use in any new node created - // for that callsite. 
- DenseMap> StackIdToMatchingCalls; - for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) { - for (auto &Call : CallsWithMetadata) { - // Ignore allocations, already handled. - if (AllocationCallToContextNodeMap.count(Call)) - continue; - auto StackIdsWithContextNodes = - getStackIdsWithContextNodesForCall(Call.call()); - // If there were no nodes created for MIBs on allocs (maybe this was in - // the unambiguous part of the MIB stack that was pruned), ignore. - if (StackIdsWithContextNodes.empty()) - continue; - // Otherwise, record this Call along with the list of ids for the last - // (outermost caller) stack id with a node. - StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back( - {Call.call(), StackIdsWithContextNodes, Func, {}}); - } - } - - // First make a pass through all stack ids that correspond to a call, - // as identified in the above loop. Compute the context ids corresponding to - // each of these calls when they correspond to multiple stack ids due to - // due to inlining. Perform any duplication of context ids required when - // there is more than one call with the same stack ids. Their (possibly newly - // duplicated) context ids are saved in the StackIdToMatchingCalls map. - DenseMap> OldToNewContextIds; - for (auto &It : StackIdToMatchingCalls) { - auto &Calls = It.getSecond(); - // Skip single calls with a single stack id. These don't need a new node. - if (Calls.size() == 1) { - auto &Ids = std::get<1>(Calls[0]); - if (Ids.size() == 1) - continue; - } - // In order to do the best and maximal matching of inlined calls to context - // node sequences we will sort the vectors of stack ids in descending order - // of length, and within each length, lexicographically by stack id. The - // latter is so that we can specially handle calls that have identical stack - // id sequences (either due to cloning or artificially because of the MIB - // context pruning). - std::sort(Calls.begin(), Calls.end(), - [](const CallContextInfo &A, const CallContextInfo &B) { - auto &IdsA = std::get<1>(A); - auto &IdsB = std::get<1>(B); - return IdsA.size() > IdsB.size() || - (IdsA.size() == IdsB.size() && IdsA < IdsB); - }); - - // Find the node for the last stack id, which should be the same - // across all calls recorded for this id, and is the id for this - // entry in the StackIdToMatchingCalls map. - uint64_t LastId = It.getFirst(); - ContextNode *LastNode = getNodeForStackId(LastId); - // We should only have kept stack ids that had nodes. - assert(LastNode); - - if (LastNode->Recursive) - continue; - - // Initialize the context ids with the last node's. We will subsequently - // refine the context ids by computing the intersection along all edges. - DenseSet LastNodeContextIds = LastNode->ContextIds; - assert(!LastNodeContextIds.empty()); - - for (unsigned I = 0; I < Calls.size(); I++) { - auto &[Call, Ids, Func, SavedContextIds] = Calls[I]; - assert(SavedContextIds.empty()); - assert(LastId == Ids.back()); - - // First compute the context ids for this stack id sequence (the - // intersection of the context ids of the corresponding nodes). - // Start with the remaining saved ids for the last node. - assert(!LastNodeContextIds.empty()); - DenseSet StackSequenceContextIds = LastNodeContextIds; - - ContextNode *PrevNode = LastNode; - ContextNode *CurNode = LastNode; - bool Skip = false; - - // Iterate backwards through the stack Ids, starting after the last Id - // in the list, which was handled once outside for all Calls. 
- for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) { - auto Id = *IdIter; - CurNode = getNodeForStackId(Id); - // We should only have kept stack ids that had nodes. - assert(CurNode); - - if (CurNode->Recursive) { - Skip = true; - break; - } - - auto *Edge = CurNode->findEdgeFromCaller(PrevNode); - // If there is no edge then the nodes belong to different MIB contexts, - // and we should skip this inlined context sequence. For example, this - // particular inlined context may include stack ids A->B, and we may - // indeed have nodes for both A and B, but it is possible that they were - // never profiled in sequence in a single MIB for any allocation (i.e. - // we might have profiled an allocation that involves the callsite A, - // but through a different one of its callee callsites, and we might - // have profiled an allocation that involves callsite B, but reached - // from a different caller callsite). - if (!Edge) { - Skip = true; - break; - } - PrevNode = CurNode; - - // Update the context ids, which is the intersection of the ids along - // all edges in the sequence. - set_intersect(StackSequenceContextIds, Edge->getContextIds()); - - // If we now have no context ids for clone, skip this call. - if (StackSequenceContextIds.empty()) { - Skip = true; - break; - } - } - if (Skip) - continue; - - // If some of this call's stack ids did not have corresponding nodes (due - // to pruning), don't include any context ids for contexts that extend - // beyond these nodes. Otherwise we would be matching part of unrelated / - // not fully matching stack contexts. To do this, subtract any context ids - // found in caller nodes of the last node found above. - if (Ids.back() != getLastStackId(Call)) { - for (auto PE : LastNode->CallerEdges) { - set_subtract(StackSequenceContextIds, PE->getContextIds()); - if (StackSequenceContextIds.empty()) - break; - } - // If we now have no context ids for clone, skip this call. - if (StackSequenceContextIds.empty()) - continue; - } - - // Check if the next set of stack ids is the same (since the Calls vector - // of tuples is sorted by the stack ids we can just look at the next one). - bool DuplicateContextIds = false; - if (I + 1 < Calls.size()) { - auto NextIds = std::get<1>(Calls[I + 1]); - DuplicateContextIds = Ids == NextIds; - } - - // If we don't have duplicate context ids, then we can assign all the - // context ids computed for the original node sequence to this call. - // If there are duplicate calls with the same stack ids then we synthesize - // new context ids that are duplicates of the originals. These are - // assigned to SavedContextIds, which is a reference into the map entry - // for this call, allowing us to access these ids later on. - OldToNewContextIds.reserve(OldToNewContextIds.size() + - StackSequenceContextIds.size()); - SavedContextIds = - DuplicateContextIds - ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds) - : StackSequenceContextIds; - assert(!SavedContextIds.empty()); - - if (!DuplicateContextIds) { - // Update saved last node's context ids to remove those that are - // assigned to other calls, so that it is ready for the next call at - // this stack id. - set_subtract(LastNodeContextIds, StackSequenceContextIds); - if (LastNodeContextIds.empty()) - break; - } - } - } - - // Propagate the duplicate context ids over the graph. 
- propagateDuplicateContextIds(OldToNewContextIds); - - if (VerifyCCG) - check(); - - // Now perform a post-order traversal over the graph, starting with the - // allocation nodes, essentially processing nodes from callers to callees. - // For any that contains an id in the map, update the graph to contain new - // nodes representing any inlining at interior callsites. Note we move the - // associated context ids over to the new nodes. - DenseSet Visited; - for (auto &Entry : AllocationCallToContextNodeMap) - assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls); -} - -uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) { - CallStack CallsiteContext( - Call->getMetadata(LLVMContext::MD_callsite)); - return CallsiteContext.back(); -} - -std::string ModuleCallsiteContextGraph::getLabel(const Function *Func, - const Instruction *Call, - unsigned CloneNo) const { - return (Twine(Call->getFunction()->getName()) + " -> " + - cast(Call)->getCalledFunction()->getName()) - .str(); -} - -std::vector -ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall( - Instruction *Call) { - CallStack CallsiteContext( - Call->getMetadata(LLVMContext::MD_callsite)); - return getStackIdsWithContextNodes( - CallsiteContext); -} - -template -template -std::vector -CallsiteContextGraph::getStackIdsWithContextNodes( - CallStack &CallsiteContext) { - std::vector StackIds; - for (auto IdOrIndex : CallsiteContext) { - auto StackId = getStackId(IdOrIndex); - ContextNode *Node = getNodeForStackId(StackId); - if (!Node) - break; - StackIds.push_back(StackId); - } - return StackIds; -} - -ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(Module &M) : Mod(M) { - for (auto &F : M) { - std::vector CallsWithMetadata; - for (auto &BB : F) { - for (auto &I : BB) { - if (!isa(I)) - continue; - if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) { - CallsWithMetadata.push_back(&I); - auto *AllocNode = addAllocNode(&I, &F); - auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite); - assert(CallsiteMD); - CallStack CallsiteContext(CallsiteMD); - // Add all of the MIBs and their stack nodes. - for (auto &MDOp : MemProfMD->operands()) { - auto *MIBMD = cast(MDOp); - MDNode *StackNode = getMIBStackNode(MIBMD); - assert(StackNode); - CallStack StackContext(StackNode); - addStackNodesForMIB( - AllocNode, StackContext, CallsiteContext, - getMIBAllocType(MIBMD)); - } - assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None); - // Memprof and callsite metadata on memory allocations no longer - // needed. - I.setMetadata(LLVMContext::MD_memprof, nullptr); - I.setMetadata(LLVMContext::MD_callsite, nullptr); - } - // For callsite metadata, add to list for this function for later use. - else if (I.getMetadata(LLVMContext::MD_callsite)) - CallsWithMetadata.push_back(&I); - } - } - if (!CallsWithMetadata.empty()) - FuncToCallsWithMetadata.push_back({&F, CallsWithMetadata}); - } - - if (DumpCCG) { - dbgs() << "CCG before updating call stack chains:\n"; - dbgs() << *this; - } - - if (ExportToDot) - exportToDot("prestackupdate"); - - updateStackNodes(); - - handleCallsitesWithMultipleTargets(); - - // Strip off remaining callsite metadata, no longer needed. - for (auto &FuncEntry : FuncToCallsWithMetadata) - for (auto &Call : FuncEntry.second) - Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr); -} - -template -void CallsiteContextGraph::handleCallsitesWithMultipleTargets() { - // Look for and workaround callsites that call multiple functions. 
- // This can happen for indirect calls, which needs better handling, and in - // more rare cases (e.g. macro expansion). - // TODO: To fix this for indirect calls we will want to perform speculative - // devirtualization using either the normal PGO info with ICP, or using the - // information in the profiled MemProf contexts. We can do this prior to - // this transformation for regular LTO, and for ThinLTO we can simulate that - // effect in the summary and perform the actual speculative devirtualization - // while cloning in the ThinLTO backend. - for (auto Entry = NonAllocationCallToContextNodeMap.begin(); - Entry != NonAllocationCallToContextNodeMap.end();) { - auto *Node = Entry->second; - assert(Node->Clones.empty()); - // Check all node callees and see if in the same function. - bool Removed = false; - auto Call = Node->Call.call(); - for (auto &Edge : Node->CalleeEdges) { - if (!Edge->Callee->hasCall()) - continue; - assert(NodeToCallingFunc.count(Edge->Callee)); - // Check if the called function matches that of the callee node. - if (calleeMatchesFunc(Call, NodeToCallingFunc[Edge->Callee])) - continue; - // Work around by setting Node to have a null call, so it gets - // skipped during cloning. Otherwise assignFunctions will assert - // because its data structures are not designed to handle this case. - Entry = NonAllocationCallToContextNodeMap.erase(Entry); - Node->setCall(CallInfo()); - Removed = true; - break; - } - if (!Removed) - Entry++; - } -} - -uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const { - // In the Module (IR) case this is already the Id. - return IdOrIndex; -} - -bool ModuleCallsiteContextGraph::calleeMatchesFunc(Instruction *Call, - const Function *Func) { - auto *CB = dyn_cast(Call); - if (!CB->getCalledOperand()) - return false; - auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts(); - auto *CalleeFunc = dyn_cast(CalleeVal); - if (CalleeFunc == Func) - return true; - auto *Alias = dyn_cast(CalleeVal); - return Alias && Alias->getAliasee() == Func; -} - -static std::string getAllocTypeString(uint8_t AllocTypes) { - if (!AllocTypes) - return "None"; - std::string Str; - if (AllocTypes & (uint8_t)AllocationType::NotCold) - Str += "NotCold"; - if (AllocTypes & (uint8_t)AllocationType::Cold) - Str += "Cold"; - return Str; -} - -template -void CallsiteContextGraph::ContextNode::dump() - const { - print(dbgs()); - dbgs() << "\n"; -} - -template -void CallsiteContextGraph::ContextNode::print( - raw_ostream &OS) const { - OS << "Node " << this << "\n"; - OS << "\t"; - printCall(OS); - if (Recursive) - OS << " (recursive)"; - OS << "\n"; - OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n"; - OS << "\tContextIds:"; - std::vector SortedIds(ContextIds.begin(), ContextIds.end()); - std::sort(SortedIds.begin(), SortedIds.end()); - for (auto Id : SortedIds) - OS << " " << Id; - OS << "\n"; - OS << "\tCalleeEdges:\n"; - for (auto &Edge : CalleeEdges) - OS << "\t\t" << *Edge << "\n"; - OS << "\tCallerEdges:\n"; - for (auto &Edge : CallerEdges) - OS << "\t\t" << *Edge << "\n"; - if (!Clones.empty()) { - OS << "\tClones: "; - FieldSeparator FS; - for (auto *Clone : Clones) - OS << FS << Clone; - OS << "\n"; - } else if (CloneOf) { - OS << "\tClone of " << CloneOf << "\n"; - } -} - -template -void CallsiteContextGraph::ContextEdge::dump() - const { - print(dbgs()); - dbgs() << "\n"; -} - -template -void CallsiteContextGraph::ContextEdge::print( - raw_ostream &OS) const { - OS << "Edge from Callee " << Callee << " to Caller: " << 
Caller - << " AllocTypes: " << getAllocTypeString(AllocTypes); - OS << " ContextIds:"; - std::vector SortedIds(ContextIds.begin(), ContextIds.end()); - std::sort(SortedIds.begin(), SortedIds.end()); - for (auto Id : SortedIds) - OS << " " << Id; -} - -template -void CallsiteContextGraph::dump() const { - print(dbgs()); -} - -template -void CallsiteContextGraph::print( - raw_ostream &OS) const { - OS << "Callsite Context Graph:\n"; - using GraphType = const CallsiteContextGraph *; - for (const auto Node : nodes(this)) { - if (Node->isRemoved()) - continue; - Node->print(OS); - OS << "\n"; - } -} - -template -static void checkEdge( - const std::shared_ptr> &Edge) { - // Confirm that alloc type is not None and that we have at least one context - // id. - assert(Edge->AllocTypes != (uint8_t)AllocationType::None); - assert(!Edge->ContextIds.empty()); -} - -template -static void checkNode(const ContextNode *Node) { - if (Node->isRemoved()) - return; - // Node's context ids should be the union of both its callee and caller edge - // context ids. - if (Node->CallerEdges.size()) { - auto EI = Node->CallerEdges.begin(); - auto &FirstEdge = *EI; - EI++; - DenseSet CallerEdgeContextIds(FirstEdge->ContextIds); - for (; EI != Node->CallerEdges.end(); EI++) { - const auto &Edge = *EI; - set_union(CallerEdgeContextIds, Edge->ContextIds); - } - // Node can have more context ids than callers if some contexts terminate at - // node and some are longer. - assert(Node->ContextIds == CallerEdgeContextIds || - set_is_subset(CallerEdgeContextIds, Node->ContextIds)); - } - if (Node->CalleeEdges.size()) { - auto EI = Node->CalleeEdges.begin(); - auto &FirstEdge = *EI; - EI++; - DenseSet CalleeEdgeContextIds(FirstEdge->ContextIds); - for (; EI != Node->CalleeEdges.end(); EI++) { - const auto &Edge = *EI; - set_union(CalleeEdgeContextIds, Edge->ContextIds); - } - assert(Node->ContextIds == CalleeEdgeContextIds); - } -} - -template -void CallsiteContextGraph::check() const { - using GraphType = const CallsiteContextGraph *; - for (const auto Node : nodes(this)) { - checkNode(Node); - for (auto &Edge : Node->CallerEdges) - checkEdge(Edge); - } -} - -template -struct GraphTraits *> { - using GraphType = const CallsiteContextGraph *; - using NodeRef = const ContextNode *; - - using NodePtrTy = std::unique_ptr>; - static NodeRef getNode(const NodePtrTy &P) { return P.get(); } - - using nodes_iterator = - mapped_iterator::const_iterator, - decltype(&getNode)>; - - static nodes_iterator nodes_begin(GraphType G) { - return nodes_iterator(G->NodeOwner.begin(), &getNode); - } - - static nodes_iterator nodes_end(GraphType G) { - return nodes_iterator(G->NodeOwner.end(), &getNode); - } - - static NodeRef getEntryNode(GraphType G) { - return G->NodeOwner.begin()->get(); - } - - using EdgePtrTy = std::shared_ptr>; - static const ContextNode * - GetCallee(const EdgePtrTy &P) { - return P->Callee; - } - - using ChildIteratorType = - mapped_iterator>>::const_iterator, - decltype(&GetCallee)>; - - static ChildIteratorType child_begin(NodeRef N) { - return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee); - } - - static ChildIteratorType child_end(NodeRef N) { - return ChildIteratorType(N->CalleeEdges.end(), &GetCallee); - } -}; - -template -struct DOTGraphTraits *> - : public DefaultDOTGraphTraits { - DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} - - using GraphType = const CallsiteContextGraph *; - using GTraits = GraphTraits; - using NodeRef = typename GTraits::NodeRef; - using ChildIteratorType = 
typename GTraits::ChildIteratorType; - - static std::string getNodeLabel(NodeRef Node, GraphType G) { - std::string LabelString = - (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") + - Twine(Node->OrigStackOrAllocId)) - .str(); - LabelString += "\n"; - if (Node->hasCall()) { - auto Func = G->NodeToCallingFunc.find(Node); - assert(Func != G->NodeToCallingFunc.end()); - LabelString += - G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo()); - } else { - LabelString += "null call"; - if (Node->Recursive) - LabelString += " (recursive)"; - else - LabelString += " (external)"; - } - return LabelString; - } - - static std::string getNodeAttributes(NodeRef Node, GraphType) { - std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " + - getContextIds(Node->ContextIds) + "\"") - .str(); - AttributeString += - (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str(); - AttributeString += ",style=\"filled\""; - if (Node->CloneOf) { - AttributeString += ",color=\"blue\""; - AttributeString += ",style=\"filled,bold,dashed\""; - } else - AttributeString += ",style=\"filled\""; - return AttributeString; - } - - static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter, - GraphType) { - auto &Edge = *(ChildIter.getCurrent()); - return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" + - Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"") - .str(); - } - - // Since the NodeOwners list includes nodes that are no longer connected to - // the graph, skip them here. - static bool isNodeHidden(NodeRef Node, GraphType) { - return Node->isRemoved(); - } - -private: - static std::string getContextIds(const DenseSet &ContextIds) { - std::string IdString = "ContextIds:"; - if (ContextIds.size() < 100) { - std::vector SortedIds(ContextIds.begin(), ContextIds.end()); - std::sort(SortedIds.begin(), SortedIds.end()); - for (auto Id : SortedIds) - IdString += (" " + Twine(Id)).str(); - } else { - IdString += (" (" + Twine(ContextIds.size()) + " ids)").str(); - } - return IdString; - } - - static std::string getColor(uint8_t AllocTypes) { - if (AllocTypes == (uint8_t)AllocationType::NotCold) - // Color "brown1" actually looks like a lighter red. - return "brown1"; - if (AllocTypes == (uint8_t)AllocationType::Cold) - return "cyan"; - if (AllocTypes == - ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold)) - // Lighter purple. - return "mediumorchid1"; - return "gray"; - } - - static std::string getNodeId(NodeRef Node) { - std::stringstream SStream; - SStream << std::hex << "N0x" << (unsigned long long)Node; - std::string Result = SStream.str(); - return Result; - } -}; - -template -void CallsiteContextGraph::exportToDot( - std::string Label) const { - WriteGraph(this, "", false, Label, - DotFilePathPrefix + "ccg." 
+ Label + ".dot"); -} - -template -bool CallsiteContextGraph::process() { - if (DumpCCG) { - dbgs() << "CCG before cloning:\n"; - dbgs() << *this; - } - if (ExportToDot) - exportToDot("postbuild"); - - if (VerifyCCG) { - check(); - } - - return false; -} - -bool MemProfContextDisambiguation::processModule(Module &M) { - bool Changed = false; - - ModuleCallsiteContextGraph CCG(M); - Changed = CCG.process(); - - return Changed; -} - -PreservedAnalyses MemProfContextDisambiguation::run(Module &M, - ModuleAnalysisManager &AM) { - if (!processModule(M)) - return PreservedAnalyses::all(); - return PreservedAnalyses::none(); -} diff --git a/llvm/test/ThinLTO/X86/memprof-summary.ll b/llvm/test/ThinLTO/X86/memprof-summary.ll new file mode 100644 index 0000000000000..597cd44c030e7 --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-summary.ll @@ -0,0 +1,184 @@ +;; Check memprof summaries (per module, combined index, and distributed indexes) + +; RUN: split-file %s %t +; RUN: opt -module-summary %t/a.ll -o %ta.bc +; RUN: opt -module-summary %t/b.ll -o %tb.bc + +; RUN: llvm-dis -o - %ta.bc | FileCheck %s --check-prefix=PRELINKDISA +; PRELINKDISA: gv: (name: "main", {{.*}} callsites: ((callee: ^2, clones: (0), stackIds: (8632435727821051414)), (callee: ^2, clones: (0), stackIds: (15025054523792398438)))))) ; guid = 15822663052811949562 + +; RUN: llvm-dis -o - %tb.bc | FileCheck %s --check-prefix=PRELINKDISB +; PRELINKDISB: ^[[PLBAR:[0-9]+]] = gv: (name: "_Z3barv", {{.*}} allocs: ((versions: (none), memProf: ((type: notcold, stackIds: (12481870273128938184, 2732490490862098848, 8632435727821051414)), (type: cold, stackIds: (12481870273128938184, 2732490490862098848, 15025054523792398438)))))))) ; guid = 4555904644815367798 +; PRELINKDISB: ^[[PLFOO:[0-9]+]] = gv: (name: "_Z3foov", {{.*}} callsites: ((callee: ^[[PLBAZ:[0-9]+]], clones: (0), stackIds: (2732490490862098848)))))) ; guid = 9191153033785521275 +; PRELINKDISB: ^[[PLBAZ]] = gv: (name: "_Z3bazv", {{.*}} callsites: ((callee: ^[[PLBAR]], clones: (0), stackIds: (12481870273128938184)))))) ; guid = 15176620447596392000 + +; RUN: llvm-bcanalyzer -dump %ta.bc | FileCheck %s --check-prefix=PRELINKBCANA +; PRELINKBCANA: + +; RUN: llvm-bcanalyzer -dump %tb.bc | FileCheck %s --check-prefix=PRELINKBCANB +; PRELINKBCANB: + +; RUN: llvm-lto2 run %ta.bc %tb.bc -o %t -save-temps \ +; RUN: -thinlto-distributed-indexes \ +; RUN: -r=%ta.bc,main,plx \ +; RUN: -r=%ta.bc,_Z3foov, \ +; RUN: -r=%ta.bc,free, \ +; RUN: -r=%ta.bc,sleep, \ +; RUN: -r=%tb.bc,_Z3foov,pl \ +; RUN: -r=%tb.bc,_Znam, \ +; RUN: -r=%tb.bc,_Z3bazv,pl + +; RUN: llvm-dis -o - %t.index.bc | FileCheck %s --check-prefix=COMBINEDDIS +; COMBINEDDIS: ^[[COMBBAR:[0-9]+]] = gv: (guid: 4555904644815367798, {{.*}} allocs: ((versions: (none), memProf: ((type: notcold, stackIds: (12481870273128938184, 2732490490862098848, 8632435727821051414)), (type: cold, stackIds: (12481870273128938184, 2732490490862098848, 15025054523792398438)))))))) +; COMBINEDDIS: ^[[COMBFOO:[0-9]+]] = gv: (guid: 9191153033785521275, {{.*}} callsites: ((callee: ^[[COMBBAZ:[0-9]+]], clones: (0), stackIds: (2732490490862098848)))))) +; COMBINEDDIS: ^[[COMBBAZ]] = gv: (guid: 15176620447596392000, {{.*}} callsites: ((callee: ^[[COMBBAR]], clones: (0), stackIds: (12481870273128938184)))))) +; COMBINEDDIS: ^[[COMBMAIN:[0-9]+]] = gv: (guid: 15822663052811949562, {{.*}} callsites: ((callee: ^[[COMBFOO]], clones: (0), stackIds: (8632435727821051414)), (callee: ^[[COMBFOO]], clones: (0), stackIds: (15025054523792398438)))))) + +; RUN: llvm-bcanalyzer 
-dump %t.index.bc | FileCheck %s --check-prefix=COMBINEDBCAN +; COMBINEDBCAN: + +; RUN: llvm-dis -o - %ta.bc.thinlto.bc | FileCheck %s --check-prefix=DISTRIBUTEDDISA +; DISTRIBUTEDDISA: gv: (guid: 9191153033785521275, {{.*}} callsites: ((callee: null, clones: (0), stackIds: (2732490490862098848)))))) +; DISTRIBUTEDDISA: gv: (guid: 15822663052811949562, {{.*}} callsites: ((callee: ^2, clones: (0), stackIds: (8632435727821051414)), (callee: ^2, clones: (0), stackIds: (15025054523792398438)))))) + +; RUN: llvm-dis -o - %tb.bc.thinlto.bc | FileCheck %s --check-prefix=DISTRIBUTEDDISB +; DISTRIBUTEDDISB: ^[[DISTRBAR:[0-9]+]] = gv: (guid: 4555904644815367798, {{.*}} allocs: ((versions: (none), memProf: ((type: notcold, stackIds: (12481870273128938184, 2732490490862098848, 8632435727821051414)), (type: cold, stackIds: (12481870273128938184, 2732490490862098848, 15025054523792398438)))))))) +; DISTRIBUTEDDISB: ^[[DISTRFOO:[0-9]+]] = gv: (guid: 9191153033785521275, {{.*}} callsites: ((callee: ^[[DISTRBAZ:[0-9]+]], clones: (0), stackIds: (2732490490862098848)))))) +; DISTRIBUTEDDISB: ^[[DISTRBAZ]] = gv: (guid: 15176620447596392000, {{.*}} callsites: ((callee: ^[[DISTRBAR]], clones: (0), stackIds: (12481870273128938184)))))) + +; RUN: llvm-bcanalyzer -dump %ta.bc.thinlto.bc | FileCheck %s --check-prefix=DISTRIBUTEDBCANA +; DISTRIBUTEDBCANA: + +; RUN: llvm-bcanalyzer -dump %tb.bc.thinlto.bc | FileCheck %s --check-prefix=DISTRIBUTEDBCANB +; DISTRIBUTEDBCANB: + +;--- a.ll +; ModuleID = 'a.cc' +source_filename = "a.cc" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: mustprogress norecurse uwtable +define dso_local noundef i32 @main(i32 noundef %argc, ptr nocapture noundef readnone %argv) local_unnamed_addr #0 !dbg !39 { +entry: + %call = call noundef ptr @_Z3foov(), !dbg !42, !callsite !43 + %call1 = call noundef ptr @_Z3foov(), !dbg !44, !callsite !45 + call void @llvm.memset.p0.i64(ptr noundef nonnull align 1 dereferenceable(10) %call, i8 0, i64 10, i1 false), !dbg !46 + call void @llvm.memset.p0.i64(ptr noundef nonnull align 1 dereferenceable(10) %call1, i8 0, i64 10, i1 false), !dbg !47 + call void @free(ptr noundef %call) #4, !dbg !48 + %call2 = call i32 @sleep(i32 noundef 10), !dbg !49 + call void @free(ptr noundef %call1) #4, !dbg !50 + ret i32 0, !dbg !51 +} + +declare !dbg !52 noundef ptr @_Z3foov() local_unnamed_addr #1 + +; Function Attrs: argmemonly mustprogress nocallback nofree nounwind willreturn writeonly +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2 + +; Function Attrs: inaccessiblemem_or_argmemonly mustprogress nounwind willreturn allockind("free") +declare void @free(ptr allocptr nocapture noundef) local_unnamed_addr #3 + +declare !dbg !53 i32 @sleep(i32 noundef) local_unnamed_addr #1 + +attributes #0 = { mustprogress norecurse uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { argmemonly mustprogress nocallback nofree nounwind willreturn writeonly } +attributes #3 = { inaccessiblemem_or_argmemonly mustprogress 
nounwind willreturn allockind("free") "alloc-family"="malloc" "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5, !6, !7, !8} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 16.0.0 (git@github.com:llvm/llvm-project.git ffecb643ee2c49e55e0689339b6d5921b5e6ff8b)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None) +!1 = !DIFile(filename: "a.cc", directory: ".", checksumkind: CSK_MD5, checksum: "ebabd56909271a1d4a7cac81c10624d5") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 8, !"PIC Level", i32 2} +!6 = !{i32 7, !"PIE Level", i32 2} +!7 = !{i32 7, !"uwtable", i32 2} +!8 = !{i32 7, !"frame-pointer", i32 2} +!39 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 5, type: !40, scopeLine: 5, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !41) +!40 = !DISubroutineType(types: !41) +!41 = !{} +!42 = !DILocation(line: 6, column: 13, scope: !39) +!43 = !{i64 8632435727821051414} +!44 = !DILocation(line: 7, column: 13, scope: !39) +!45 = !{i64 -3421689549917153178} +!46 = !DILocation(line: 8, column: 3, scope: !39) +!47 = !DILocation(line: 9, column: 3, scope: !39) +!48 = !DILocation(line: 10, column: 3, scope: !39) +!49 = !DILocation(line: 11, column: 3, scope: !39) +!50 = !DILocation(line: 12, column: 3, scope: !39) +!51 = !DILocation(line: 13, column: 3, scope: !39) +!52 = !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 4, type: !40, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !41) +!53 = !DISubprogram(name: "sleep", scope: !54, file: !54, line: 453, type: !40, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !41) +!54 = !DIFile(filename: "include/unistd.h", directory: "/usr", checksumkind: CSK_MD5, checksum: "ee8f41a17f563f029d0e930ad871815a") + +;--- b.ll +; ModuleID = 'b.cc' +source_filename = "b.cc" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: mustprogress noinline uwtable +define internal noalias noundef nonnull ptr @_Z3barv() local_unnamed_addr #0 !dbg !39 { +entry: + %call = call noalias noundef nonnull dereferenceable(10) ptr @_Znam(i64 noundef 10) #2, !dbg !42, !memprof !43, !callsite !48 + ret ptr %call, !dbg !49 +} + +; Function Attrs: nobuiltin allocsize(0) +declare noundef nonnull ptr @_Znam(i64 noundef) local_unnamed_addr #1 + +; Function Attrs: mustprogress noinline uwtable +define dso_local noalias noundef nonnull ptr @_Z3bazv() local_unnamed_addr #0 !dbg !50 { +entry: + %call = call noundef ptr @_Z3barv(), !dbg !51, !callsite !52 + ret ptr %call, !dbg !53 +} + +; Function Attrs: mustprogress uwtable +define dso_local noalias noundef nonnull ptr @_Z3foov() local_unnamed_addr #3 !dbg !54 { +entry: + %call = call noundef ptr @_Z3bazv(), !dbg !55, !callsite !56 + ret ptr %call, !dbg !57 +} + +attributes #0 = { mustprogress noinline uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { nobuiltin allocsize(0) "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { builtin allocsize(0) } +attributes #3 = { mustprogress uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5, !6, !7, !8} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 16.0.0 (git@github.com:llvm/llvm-project.git ffecb643ee2c49e55e0689339b6d5921b5e6ff8b)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None) +!1 = !DIFile(filename: "b.cc", directory: ".", checksumkind: CSK_MD5, checksum: "335f81d275af57725cfc9ffc7be49bc2") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 8, !"PIC Level", i32 2} +!6 = !{i32 7, !"PIE Level", i32 2} +!7 = !{i32 7, !"uwtable", i32 2} +!8 = !{i32 7, !"frame-pointer", i32 2} +!39 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !1, file: !1, line: 1, type: !40, scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !41) +!40 = !DISubroutineType(types: !41) +!41 = !{} +!42 = !DILocation(line: 2, column: 10, scope: !39) +!43 = !{!44, !46} +!44 = !{!45, !"notcold"} +!45 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!46 = !{!47, !"cold"} +!47 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} +!48 = !{i64 9086428284934609951} +!49 = !DILocation(line: 2, column: 3, scope: !39) +!50 = distinct !DISubprogram(name: "baz", linkageName: "_Z3bazv", scope: !1, file: !1, line: 5, type: !40, scopeLine: 5, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !41) +!51 = !DILocation(line: 6, column: 10, scope: !50) +!52 = !{i64 -5964873800580613432} +!53 = !DILocation(line: 6, column: 3, scope: !50) +!54 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 9, type: !40, scopeLine: 9, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !41) +!55 = !DILocation(line: 10, column: 10, scope: !54) +!56 = !{i64 2732490490862098848} +!57 = !DILocation(line: 10, column: 3, scope: !54) diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll b/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll deleted file mode 100644 index 539d88a815ed1..0000000000000 --- a/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll +++ /dev/null @@ -1,158 +0,0 @@ -;; Test callsite context graph generation for simple call graph with -;; two memprof contexts and no inlining. 
-;; -;; Original code looks like: -;; -;; char *bar() { -;; return new char[10]; -;; } -;; -;; char *baz() { -;; return bar(); -;; } -;; -;; char *foo() { -;; return baz(); -;; } -;; -;; int main(int argc, char **argv) { -;; char *x = foo(); -;; char *y = foo(); -;; memset(x, 0, 10); -;; memset(y, 0, 10); -;; delete[] x; -;; sleep(10); -;; delete[] y; -;; return 0; -;; } -;; -;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the -;; memory freed after sleep(10) results in cold lifetimes. -;; -;; The IR was then reduced using llvm-reduce with the expected FileCheck input. - -; RUN: opt -passes=memprof-context-disambiguation \ -; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ -; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ -; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP - -; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT - -target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -define i32 @main() #0 { -entry: - %call = call noundef ptr @_Z3foov(), !callsite !0 - %call1 = call noundef ptr @_Z3foov(), !callsite !1 - ret i32 0 -} - -; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) -declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #1 - -; Function Attrs: nobuiltin -declare void @_ZdaPv() #2 - -define internal ptr @_Z3barv() #3 { -entry: - %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !2, !callsite !7 - ret ptr null -} - -declare ptr @_Znam(i64) - -define internal ptr @_Z3bazv() #4 { -entry: - %call = call noundef ptr @_Z3barv(), !callsite !8 - ret ptr null -} - -; Function Attrs: noinline -define internal ptr @_Z3foov() #5 { -entry: - %call = call noundef ptr @_Z3bazv(), !callsite !9 - ret ptr null -} - -; uselistorder directives -uselistorder ptr @_Z3foov, { 1, 0 } - -attributes #0 = { "tune-cpu"="generic" } -attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: write) } -attributes #2 = { nobuiltin } -attributes #3 = { "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" } -attributes #4 = { "stack-protector-buffer-size"="8" } -attributes #5 = { noinline } -attributes #6 = { builtin } - -!0 = !{i64 8632435727821051414} -!1 = !{i64 -3421689549917153178} -!2 = !{!3, !5} -!3 = !{!4, !"notcold"} -!4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} -!5 = !{!6, !"cold"} -!6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} -!7 = !{i64 9086428284934609951} -!8 = !{i64 -5964873800580613432} -!9 = !{i64 2732490490862098848} - - -; DUMP: CCG before cloning: -; DUMP: Callsite Context Graph: -; DUMP: Node [[BAR:0x[a-z0-9]+]] -; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) -; DUMP: AllocTypes: NotColdCold -; DUMP: ContextIds: 1 2 -; DUMP: CalleeEdges: -; DUMP: CallerEdges: -; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 - -; DUMP: Node [[BAZ]] -; DUMP: %call = call noundef ptr @_Z3barv() (clone 0) -; DUMP: AllocTypes: NotColdCold -; DUMP: ContextIds: 1 2 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ]] AllocTypes: NotColdCold ContextIds: 1 2 -; DUMP: CallerEdges: -; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 - -; DUMP: Node [[FOO]] -; DUMP: %call 
= call noundef ptr @_Z3bazv() (clone 0) -; DUMP: AllocTypes: NotColdCold -; DUMP: ContextIds: 1 2 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO]] AllocTypes: NotColdCold ContextIds: 1 2 -; DUMP: CallerEdges: -; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 -; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 - -; DUMP: Node [[MAIN1]] -; DUMP: %call = call noundef ptr @_Z3foov() (clone 0) -; DUMP: AllocTypes: NotCold -; DUMP: ContextIds: 1 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 -; DUMP: CallerEdges: - -; DUMP: Node [[MAIN2]] -; DUMP: %call1 = call noundef ptr @_Z3foov() (clone 0) -; DUMP: AllocTypes: Cold -; DUMP: ContextIds: 2 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 -; DUMP: CallerEdges: - - -; DOT: digraph "postbuild" { -; DOT: label="postbuild"; -; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"]; -; DOT: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 12481870273128938184\n_Z3bazv -\> _Z3barv}"]; -; DOT: Node[[BAZ]] -> Node[[BAR]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"]; -; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 2732490490862098848\n_Z3foov -\> _Z3bazv}"]; -; DOT: Node[[FOO]] -> Node[[BAZ]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"]; -; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; -; DOT: Node[[MAIN1]] -> Node[[FOO]][tooltip="ContextIds: 1",fillcolor="brown1"]; -; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; -; DOT: Node[[MAIN2]] -> Node[[FOO]][tooltip="ContextIds: 2",fillcolor="cyan"]; -; DOT: } diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids.ll b/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids.ll deleted file mode 100644 index c5ed97f182a98..0000000000000 --- a/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids.ll +++ /dev/null @@ -1,232 +0,0 @@ -;; Test callsite context graph generation for call graph with with MIBs -;; that have pruned contexts that partially match multiple inlined -;; callsite contexts, requiring duplication of context ids and nodes -;; while matching callsite nodes onto the graph. 
-;; -;; Original code looks like: -;; -;; char *D() { -;; return new char[10]; -;; } -;; -;; char *F() { -;; return D(); -;; } -;; -;; char *C() { -;; return D(); -;; } -;; -;; char *B() { -;; return C(); -;; } -;; -;; char *E() { -;; return C(); -;; } -;; int main(int argc, char **argv) { -;; char *x = B(); // cold -;; char *y = E(); // cold -;; char *z = F(); // default -;; memset(x, 0, 10); -;; memset(y, 0, 10); -;; memset(z, 0, 10); -;; delete[] z; -;; sleep(10); -;; delete[] x; -;; delete[] y; -;; return 0; -;; } -;; -;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the -;; memory freed after sleep(10) results in cold lifetimes. -;; -;; The code below was created by forcing inlining of C into both B and E. -;; Since both allocation contexts via C are cold, the matched memprof -;; metadata has the context pruned above C's callsite. This requires -;; matching the stack node for C to callsites where it was inlined (i.e. -;; the callsites in B and E that have callsite metadata that includes C's). -;; It also requires duplication of that node in the graph as well as the -;; duplication of the context ids along that path through the graph, -;; so that we can represent the duplicated (via inlining) C callsite. -;; -;; The IR was then reduced using llvm-reduce with the expected FileCheck input. - -; RUN: opt -passes=memprof-context-disambiguation \ -; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ -; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ -; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP - -; RUN: cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE -; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST - -target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -define internal ptr @_Z1Dv() { -entry: - %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !0, !callsite !5 - ret ptr null -} - -declare ptr @_Znam(i64) - -define internal ptr @_Z1Fv() #0 { -entry: - %call = call noundef ptr @_Z1Dv(), !callsite !6 - ret ptr null -} - -; Function Attrs: mustprogress noinline optnone uwtable -define internal ptr @_Z1Cv() #1 { -entry: - %call = call noundef ptr @_Z1Dv(), !callsite !7 - ret ptr null -} - -; Function Attrs: mustprogress noinline optnone uwtable -define internal ptr @_Z1Bv() #1 { -entry: - %call.i = call noundef ptr @_Z1Dv(), !callsite !8 - ret ptr null -} - -; Function Attrs: mustprogress noinline optnone uwtable -define internal ptr @_Z1Ev() #1 { -entry: - %call.i = call noundef ptr @_Z1Dv(), !callsite !9 - ret ptr null -} - -; Function Attrs: noinline -declare i32 @main() #2 - -; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) -declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #3 - -; Function Attrs: nounwind -declare void @_ZdaPv() #4 - -declare i32 @sleep() #5 - -attributes #0 = { "disable-tail-calls"="true" } -attributes #1 = { mustprogress noinline optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } -attributes #2 = { noinline } -attributes #3 = { nocallback nofree nounwind willreturn memory(argmem: write) } -attributes #4 = { nounwind } -attributes #5 = { "no-trapping-math"="true" } -attributes #6 = { builtin } - -!0 = 
!{!1, !3} -!1 = !{!2, !"cold"} -!2 = !{i64 6541423618768552252, i64 -6270142974039008131} -!3 = !{!4, !"notcold"} -!4 = !{i64 6541423618768552252, i64 -4903163940066524832} -!5 = !{i64 6541423618768552252} -!6 = !{i64 -4903163940066524832} -!7 = !{i64 -6270142974039008131} -!8 = !{i64 -6270142974039008131, i64 -184525619819294889} -!9 = !{i64 -6270142974039008131, i64 1905834578520680781} - - -;; After adding only the alloc node memprof metadata, we only have 2 contexts. - -; DUMP: CCG before updating call stack chains: -; DUMP: Callsite Context Graph: -; DUMP: Node [[D:0x[a-z0-9]+]] -; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) -; DUMP: AllocTypes: NotColdCold -; DUMP: ContextIds: 1 2 -; DUMP: CalleeEdges: -; DUMP: CallerEdges: -; DUMP: Edge from Callee [[D]] to Caller: [[C:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 1 -; DUMP: Edge from Callee [[D]] to Caller: [[F:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 2 - -; DUMP: Node [[C]] -; DUMP: null Call -; DUMP: AllocTypes: Cold -; DUMP: ContextIds: 1 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[D]] to Caller: [[C]] AllocTypes: Cold ContextIds: 1 -; DUMP: CallerEdges: - -; DUMP: Node [[F]] -; DUMP: null Call -; DUMP: AllocTypes: NotCold -; DUMP: ContextIds: 2 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[D]] to Caller: [[F]] AllocTypes: NotCold ContextIds: 2 -; DUMP: CallerEdges: - -;; After updating for callsite metadata, we should have generated context ids 3 and 4, -;; along with 2 new nodes for those callsites. All have the same allocation type -;; behavior as the original C node. - -; DUMP: CCG before cloning: -; DUMP: Callsite Context Graph: -; DUMP: Node [[D]] -; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) -; DUMP: AllocTypes: NotColdCold -; DUMP: ContextIds: 1 2 3 4 -; DUMP: CalleeEdges: -; DUMP: CallerEdges: -; DUMP: Edge from Callee [[D]] to Caller: [[F]] AllocTypes: NotCold ContextIds: 2 -; DUMP: Edge from Callee [[D]] to Caller: [[C2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 3 -; DUMP: Edge from Callee [[D]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4 -; DUMP: Edge from Callee [[D]] to Caller: [[E:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 1 - -; DUMP: Node [[F]] -; DUMP: %call = call noundef ptr @_Z1Dv() (clone 0) -; DUMP: AllocTypes: NotCold -; DUMP: ContextIds: 2 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[D]] to Caller: [[F]] AllocTypes: NotCold ContextIds: 2 -; DUMP: CallerEdges: - -; DUMP: Node [[C2]] -; DUMP: %call = call noundef ptr @_Z1Dv() (clone 0) -; DUMP: AllocTypes: Cold -; DUMP: ContextIds: 3 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[D]] to Caller: [[C2]] AllocTypes: Cold ContextIds: 3 -; DUMP: CallerEdges: - -; DUMP: Node [[B]] -; DUMP: %call.i = call noundef ptr @_Z1Dv() (clone 0) -; DUMP: AllocTypes: Cold -; DUMP: ContextIds: 4 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[D]] to Caller: [[B]] AllocTypes: Cold ContextIds: 4 -; DUMP: CallerEdges: - -; DUMP: Node [[E]] -; DUMP: %call.i = call noundef ptr @_Z1Dv() (clone 0) -; DUMP: AllocTypes: Cold -; DUMP: ContextIds: 1 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[D]] to Caller: [[E]] AllocTypes: Cold ContextIds: 1 -; DUMP: CallerEdges: - - -; DOTPRE: digraph "prestackupdate" { -; DOTPRE: label="prestackupdate"; -; DOTPRE: Node[[D:0x[a-z0-9]+]] [shape=record,tooltip="N[[D]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z1Dv -\> _Znam}"]; -; DOTPRE: Node[[C:0x[a-z0-9]+]] 
[shape=record,tooltip="N[[C]] ContextIds: 1",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 12176601099670543485\nnull call (external)}"]; -; DOTPRE: Node[[C]] -> Node[[D]][tooltip="ContextIds: 1",fillcolor="cyan"]; -; DOTPRE: Node[[F:0x[a-z0-9]+]] [shape=record,tooltip="N[[F]] ContextIds: 2",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 13543580133643026784\nnull call (external)}"]; -; DOTPRE: Node[[F]] -> Node[[D]][tooltip="ContextIds: 2",fillcolor="brown1"]; -; DOTPRE: } - - -; DOTPOST:digraph "postbuild" { -; DOTPOST: label="postbuild"; -; DOTPOST: Node[[D:0x[a-z0-9]+]] [shape=record,tooltip="N[[D]] ContextIds: 1 2 3 4",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z1Dv -\> _Znam}"]; -; DOTPOST: Node[[F:0x[a-z0-9]+]] [shape=record,tooltip="N[[F]] ContextIds: 2",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 13543580133643026784\n_Z1Fv -\> _Z1Dv}"]; -; DOTPOST: Node[[F]] -> Node[[D]][tooltip="ContextIds: 2",fillcolor="brown1"]; -; DOTPOST: Node[[C:0x[a-z0-9]+]] [shape=record,tooltip="N[[C]] ContextIds: 3",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 0\n_Z1Cv -\> _Z1Dv}"]; -; DOTPOST: Node[[C]] -> Node[[D]][tooltip="ContextIds: 3",fillcolor="cyan"]; -; DOTPOST: Node[[B:0x[a-z0-9]+]] [shape=record,tooltip="N[[B]] ContextIds: 4",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 0\n_Z1Bv -\> _Z1Dv}"]; -; DOTPOST: Node[[B]] -> Node[[D]][tooltip="ContextIds: 4",fillcolor="cyan"]; -; DOTPOST: Node[[E:0x[a-z0-9]+]] [shape=record,tooltip="N[[E]] ContextIds: 1",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 0\n_Z1Ev -\> _Z1Dv}"]; -; DOTPOST: Node[[E]] -> Node[[D]][tooltip="ContextIds: 1",fillcolor="cyan"]; -; DOTPOST:} diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids2.ll b/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids2.ll deleted file mode 100644 index da0fd3f44b45e..0000000000000 --- a/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids2.ll +++ /dev/null @@ -1,386 +0,0 @@ -;; Test callsite context graph generation for call graph with with MIBs -;; that have pruned contexts that partially match multiple inlined -;; callsite contexts, requiring duplication of context ids and nodes -;; while matching callsite nodes onto the graph. This test requires more -;; complex duplication due to multiple contexts for different allocations -;; that share some of the same callsite nodes. 
-;; -;; Original code looks like: -;; -;; char *D(bool Call1) { -;; if (Call1) -;; return new char[10]; -;; else -;; return new char[10]; -;; } -;; -;; char *C(bool Call1) { -;; return D(Call1); -;; } -;; -;; char *B(bool Call1) { -;; if (Call1) -;; return C(true); -;; else -;; return C(false); -;; } -;; -;; char *A(bool Call1) { -;; return B(Call1); -;; } -;; -;; char *A1() { -;; return A(true); -;; } -;; -;; char *A2() { -;; return A(true); -;; } -;; -;; char *A3() { -;; return A(false); -;; } -;; -;; char *A4() { -;; return A(false); -;; } -;; -;; char *E() { -;; return B(true); -;; } -;; -;; char *F() { -;; return B(false); -;; } -;; -;; int main(int argc, char **argv) { -;; char *a1 = A1(); // cold -;; char *a2 = A2(); // cold -;; char *e = E(); // default -;; char *a3 = A3(); // default -;; char *a4 = A4(); // default -;; char *f = F(); // cold -;; memset(a1, 0, 10); -;; memset(a2, 0, 10); -;; memset(e, 0, 10); -;; memset(a3, 0, 10); -;; memset(a4, 0, 10); -;; memset(f, 0, 10); -;; delete[] a3; -;; delete[] a4; -;; delete[] e; -;; sleep(10); -;; delete[] a1; -;; delete[] a2; -;; delete[] f; -;; return 0; -;; } -;; -;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the -;; memory freed after sleep(10) results in cold lifetimes. -;; -;; The code below was created by forcing inlining of A into its callers, -;; without any other inlining or optimizations. Since both allocation contexts -;; via A for each allocation in D have the same allocation type (cold via -;; A1 and A2 for the first new in D, and non-cold via A3 and A4 for the second -;; new in D, the contexts for those respective allocations are pruned above A. -;; The allocations via E and F are to ensure we don't prune above B. -;; -;; The matching onto the inlined A[1234]->A sequences will require duplication -;; of the context id assigned to the context from A for each allocation in D. -;; This test ensures that we do this correctly in the presence of callsites -;; shared by the different duplicated context ids (i.e. callsite in C). -;; -;; The IR was then reduced using llvm-reduce with the expected FileCheck input. - -; RUN: opt -passes=memprof-context-disambiguation \ -; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ -; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ -; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP - - -target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; Function Attrs: mustprogress noinline uwtable -define ptr @_Z1Db(i1 %Call1) #0 { -entry: - %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7, !memprof !0, !callsite !5 - br label %return - -if.else: ; No predecessors! - %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7, !memprof !6, !callsite !11 - br label %return - -return: ; preds = %if.else, %entry - ret ptr null -} - -; Function Attrs: nobuiltin -declare ptr @_Znam(i64) #1 - -define ptr @_Z1Cb(i1 %Call1) { -entry: - %tobool = trunc i8 0 to i1 - %call = call noundef ptr @_Z1Db(i1 noundef zeroext %tobool), !callsite !12 - ret ptr null -} - -; Function Attrs: mustprogress noinline uwtable -define ptr @_Z1Bb(i1 %Call1) #0 { -entry: - %call = call noundef ptr @_Z1Cb(i1 noundef zeroext true), !callsite !13 - br label %return - -if.else: ; No predecessors! 
- %call1 = call noundef ptr @_Z1Cb(i1 noundef zeroext false), !callsite !14 - br label %return - -return: ; preds = %if.else, %entry - ret ptr null -} - -define ptr @_Z1Ab(i1 %tobool) #2 { -entry: - %call = call noundef ptr @_Z1Bb(i1 noundef zeroext %tobool), !callsite !15 - ret ptr null -} - -; Function Attrs: mustprogress noinline uwtable -define ptr @_Z2A1v(i1 %tobool.i) #0 { -entry: - %call.i = call noundef ptr @_Z1Bb(i1 noundef zeroext %tobool.i), !callsite !16 - ret ptr null -} - -; Function Attrs: mustprogress noinline uwtable -define ptr @_Z2A2v(i1 %tobool.i) #0 { -entry: - %call.i = call noundef ptr @_Z1Bb(i1 noundef zeroext %tobool.i), !callsite !17 - ret ptr null -} - -; Function Attrs: mustprogress noinline uwtable -define ptr @_Z2A3v(i1 %tobool.i) #0 { -entry: - %call.i = call noundef ptr @_Z1Bb(i1 noundef zeroext %tobool.i), !callsite !18 - ret ptr null -} - -; Function Attrs: mustprogress noinline uwtable -define ptr @_Z2A4v(i1 %tobool.i) #0 { -entry: - %call.i = call noundef ptr @_Z1Bb(i1 noundef zeroext %tobool.i), !callsite !19 - ret ptr null -} - -; Function Attrs: mustprogress noinline uwtable -define ptr @_Z1Ev() #0 { -entry: - %call = call noundef ptr @_Z1Bb(i1 noundef zeroext true), !callsite !20 - ret ptr null -} - -; Function Attrs: mustprogress noinline uwtable -define ptr @_Z1Fv() #0 { -entry: - %call = call noundef ptr @_Z1Bb(i1 noundef zeroext false), !callsite !21 - ret ptr null -} - -; Function Attrs: noinline -declare i32 @main() #3 - -; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) -declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #4 - -declare void @_ZdaPv() #5 - -declare i32 @sleep() #6 - -; uselistorder directives -uselistorder ptr @_Znam, { 1, 0 } - -attributes #0 = { mustprogress noinline uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } -attributes #1 = { nobuiltin } -attributes #2 = { "tune-cpu"="generic" } -attributes #3 = { noinline } -attributes #4 = { nocallback nofree nounwind willreturn memory(argmem: write) } -attributes #5 = { "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" } -attributes #6 = { "disable-tail-calls"="true" } -attributes #7 = { builtin allocsize(0) } - -!0 = !{!1, !3} -!1 = !{!2, !"notcold"} -!2 = !{i64 4854880825882961848, i64 -904694911315397047, i64 6532298921261778285, i64 1905834578520680781} -!3 = !{!4, !"cold"} -!4 = !{i64 4854880825882961848, i64 -904694911315397047, i64 6532298921261778285, i64 -6528110295079665978} -!5 = !{i64 4854880825882961848} -!6 = !{!7, !9} -!7 = !{!8, !"notcold"} -!8 = !{i64 -8775068539491628272, i64 -904694911315397047, i64 7859682663773658275, i64 -6528110295079665978} -!9 = !{!10, !"cold"} -!10 = !{i64 -8775068539491628272, i64 -904694911315397047, i64 7859682663773658275, i64 -4903163940066524832} -!11 = !{i64 -8775068539491628272} -!12 = !{i64 -904694911315397047} -!13 = !{i64 6532298921261778285} -!14 = !{i64 7859682663773658275} -!15 = !{i64 -6528110295079665978} -!16 = !{i64 -6528110295079665978, i64 5747919905719679568} -!17 = !{i64 -6528110295079665978, i64 -5753238080028016843} -!18 = !{i64 -6528110295079665978, i64 1794685869326395337} -!19 = !{i64 -6528110295079665978, i64 5462047985461644151} -!20 = !{i64 1905834578520680781} -!21 = !{i64 -4903163940066524832} - - -;; After adding only the alloc node memprof metadata, we only 
have 4 contexts (we only -;; match the interesting parts of the pre-update graph here). - -; DUMP: CCG before updating call stack chains: -; DUMP: Callsite Context Graph: - -; DUMP: Node [[D1:0x[a-z0-9]+]] -; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7 (clone 0) -; DUMP: AllocTypes: NotColdCold -; DUMP: ContextIds: 1 2 - -; DUMP: Node [[C:0x[a-z0-9]+]] -; DUMP: null Call -; DUMP: AllocTypes: NotColdCold -; DUMP: ContextIds: 1 2 3 4 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[D1]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 1 2 -; DUMP: Edge from Callee [[D2:0x[a-z0-9]+]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 3 4 - -; DUMP: Node [[D2]] -; DUMP: %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7 (clone 0) -; DUMP: AllocTypes: NotColdCold -; DUMP: ContextIds: 3 4 - - -;; After updating for callsite metadata, we should have duplicated the context -;; ids coming from node A (2 and 3) 4 times, for the 4 different callers of A, -;; and used those on new nodes for those callers. Note that while in reality -;; we only have cold edges coming from A1 and A2 and noncold from A3 and A4, -;; due to the pruning we have lost this information and thus end up duplicating -;; both of A's contexts to all of the new nodes (which could result in some -;; unnecessary cloning. - -; DUMP: CCG before cloning: -; DUMP: Callsite Context Graph: -; DUMP: Node [[D1]] -; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7 (clone 0) -; DUMP: AllocTypes: NotColdCold -; DUMP: ContextIds: 1 2 5 7 9 11 -; DUMP: CalleeEdges: -; DUMP: CallerEdges: -; DUMP: Edge from Callee [[D1]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 1 2 5 7 9 11 - -; DUMP: Node [[C]] -; DUMP: %call = call noundef ptr @_Z1Db(i1 noundef zeroext %tobool) (clone 0) -; DUMP: AllocTypes: NotColdCold -; DUMP: ContextIds: 1 2 3 4 5 6 7 8 9 10 11 12 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[D1]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 1 2 5 7 9 11 -; DUMP: Edge from Callee [[D2]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 3 4 6 8 10 12 -; DUMP: CallerEdges: -; DUMP: Edge from Callee [[C]] to Caller: [[B1:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 5 7 9 11 -; DUMP: Edge from Callee [[C]] to Caller: [[B2:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 3 4 6 8 10 12 - -; DUMP: Node [[B1]] -; DUMP: %call = call noundef ptr @_Z1Cb(i1 noundef zeroext true) (clone 0) -; DUMP: AllocTypes: NotColdCold -; DUMP: ContextIds: 1 2 5 7 9 11 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[C]] to Caller: [[B1]] AllocTypes: NotColdCold ContextIds: 1 2 5 7 9 11 -; DUMP: CallerEdges: -; DUMP: Edge from Callee [[B1]] to Caller: [[E:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 -; DUMP: Edge from Callee [[B1]] to Caller: [[A2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 5 -; DUMP: Edge from Callee [[B1]] to Caller: [[A3:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 7 -; DUMP: Edge from Callee [[B1]] to Caller: [[A1:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 9 -; DUMP: Edge from Callee [[B1]] to Caller: [[A4:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 11 -; DUMP: Edge from Callee [[B1]] to Caller: [[A:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 - -; DUMP: Node [[E]] -; DUMP: %call = call noundef ptr @_Z1Bb(i1 noundef zeroext true) (clone 0) -; DUMP: AllocTypes: NotCold -; DUMP: ContextIds: 1 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[B1]] to Caller: [[E]] AllocTypes: NotCold ContextIds: 1 -; DUMP: CallerEdges: - -; DUMP: Node [[D2]] 
-; DUMP: %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7 (clone 0) -; DUMP: AllocTypes: NotColdCold -; DUMP: ContextIds: 3 4 6 8 10 12 -; DUMP: CalleeEdges: -; DUMP: CallerEdges: -; DUMP: Edge from Callee [[D2]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 3 4 6 8 10 12 - -; DUMP: Node [[B2]] -; DUMP: %call1 = call noundef ptr @_Z1Cb(i1 noundef zeroext false) (clone 0) -; DUMP: AllocTypes: NotColdCold -; DUMP: ContextIds: 3 4 6 8 10 12 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[C]] to Caller: [[B2]] AllocTypes: NotColdCold ContextIds: 3 4 6 8 10 12 -; DUMP: CallerEdges: -; DUMP: Edge from Callee [[B2]] to Caller: [[F:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4 -; DUMP: Edge from Callee [[B2]] to Caller: [[A2]] AllocTypes: NotCold ContextIds: 6 -; DUMP: Edge from Callee [[B2]] to Caller: [[A3]] AllocTypes: NotCold ContextIds: 8 -; DUMP: Edge from Callee [[B2]] to Caller: [[A1]] AllocTypes: NotCold ContextIds: 10 -; DUMP: Edge from Callee [[B2]] to Caller: [[A4]] AllocTypes: NotCold ContextIds: 12 -; DUMP: Edge from Callee [[B2]] to Caller: [[A]] AllocTypes: NotCold ContextIds: 3 - -; DUMP: Node [[F]] -; DUMP: %call = call noundef ptr @_Z1Bb(i1 noundef zeroext false) (clone 0) -; DUMP: AllocTypes: Cold -; DUMP: ContextIds: 4 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[B2]] to Caller: [[F]] AllocTypes: Cold ContextIds: 4 -; DUMP: CallerEdges: - -; DUMP: Node [[A2]] -; DUMP: %call = call noundef ptr @_Z1Bb(i1 noundef zeroext %tobool) (clone 0) -; DUMP: AllocTypes: NotColdCold -; DUMP: ContextIds: 5 6 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[B1]] to Caller: [[A2]] AllocTypes: Cold ContextIds: 5 -; DUMP: Edge from Callee [[B2]] to Caller: [[A2]] AllocTypes: NotCold ContextIds: 6 -; DUMP: CallerEdges: - -; DUMP: Node [[A3]] -; DUMP: %call.i = call noundef ptr @_Z1Bb(i1 noundef zeroext %tobool.i) (clone 0) -; DUMP: AllocTypes: NotColdCold -; DUMP: ContextIds: 7 8 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[B1]] to Caller: [[A3]] AllocTypes: Cold ContextIds: 7 -; DUMP: Edge from Callee [[B2]] to Caller: [[A3]] AllocTypes: NotCold ContextIds: 8 -; DUMP: CallerEdges: - -; DUMP: Node [[A1]] -; DUMP: %call.i = call noundef ptr @_Z1Bb(i1 noundef zeroext %tobool.i) (clone 0) -; DUMP: AllocTypes: NotColdCold -; DUMP: ContextIds: 9 10 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[B1]] to Caller: [[A1]] AllocTypes: Cold ContextIds: 9 -; DUMP: Edge from Callee [[B2]] to Caller: [[A1]] AllocTypes: NotCold ContextIds: 10 -; DUMP: CallerEdges: - -; DUMP: Node [[A4]] -; DUMP: %call.i = call noundef ptr @_Z1Bb(i1 noundef zeroext %tobool.i) (clone 0) -; DUMP: AllocTypes: NotColdCold -; DUMP: ContextIds: 11 12 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[B1]] to Caller: [[A4]] AllocTypes: Cold ContextIds: 11 -; DUMP: Edge from Callee [[B2]] to Caller: [[A4]] AllocTypes: NotCold ContextIds: 12 -; DUMP: CallerEdges: - -; DUMP: Node [[A]] -; DUMP: %call.i = call noundef ptr @_Z1Bb(i1 noundef zeroext %tobool.i) (clone 0) -; DUMP: AllocTypes: NotColdCold -; DUMP: ContextIds: 2 3 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[B1]] to Caller: [[A]] AllocTypes: Cold ContextIds: 2 -; DUMP: Edge from Callee [[B2]] to Caller: [[A]] AllocTypes: NotCold ContextIds: 3 -; DUMP: CallerEdges: diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll b/llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll deleted file mode 100644 index 9ebf219dd37a0..0000000000000 --- 
a/llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll +++ /dev/null @@ -1,261 +0,0 @@ -;; Tests callsite context graph generation for call graph containing indirect -;; calls. Currently this should result in conservative behavior, such that the -;; indirect call receives a null call in its graph node, to prevent subsequent -;; cloning. -;; -;; Original code looks like: -;; -;; char *foo() { -;; return new char[10]; -;; } -;; class A { -;; public: -;; virtual char *x() { return foo(); } -;; }; -;; class B : public A { -;; public: -;; char *x() final { return foo(); } -;; }; -;; char *bar(A *a) { -;; return a->x(); -;; } -;; int main(int argc, char **argv) { -;; char *x = foo(); -;; char *y = foo(); -;; B b; -;; char *z = bar(&b); -;; char *w = bar(&b); -;; A a; -;; char *r = bar(&a); -;; char *s = bar(&a); -;; memset(x, 0, 10); -;; memset(y, 0, 10); -;; memset(z, 0, 10); -;; memset(w, 0, 10); -;; memset(r, 0, 10); -;; memset(s, 0, 10); -;; delete[] x; -;; delete[] w; -;; delete[] r; -;; sleep(10); -;; delete[] y; -;; delete[] z; -;; delete[] s; -;; return 0; -;; } -;; -;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the -;; memory freed after sleep(10) results in cold lifetimes. -;; -;; Compiled without optimization to prevent inlining and devirtualization. -;; -;; The IR was then reduced using llvm-reduce with the expected FileCheck input. - -; RUN: opt -passes=memprof-context-disambiguation \ -; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ -; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ -; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP - -; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT - - -target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -declare ptr @_Z3barP1A(ptr) - -define i32 @main(ptr %b, ptr %a) #0 { -entry: - %call = call noundef ptr @_Z3foov(), !callsite !0 - %call1 = call noundef ptr @_Z3foov(), !callsite !1 - %call2 = call noundef ptr @_Z3barP1A(ptr noundef %b), !callsite !2 - %call3 = call noundef ptr @_Z3barP1A(ptr noundef %b), !callsite !3 - %call4 = call noundef ptr @_Z3barP1A(ptr noundef %a), !callsite !4 - %call5 = call noundef ptr @_Z3barP1A(ptr noundef %a), !callsite !5 - ret i32 0 -} - -; Function Attrs: noinline -declare void @_ZN1BC2Ev() #1 - -; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) -declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2 - -; Function Attrs: nobuiltin -declare void @_ZdaPv() #3 - -define internal ptr @_ZN1A1xEv() #4 { -entry: - %call = call noundef ptr @_Z3foov(), !callsite !6 - ret ptr null -} - -; Function Attrs: mustprogress uwtable -define internal ptr @_ZN1B1xEv() #5 { -entry: - %call = call noundef ptr @_Z3foov(), !callsite !7 - ret ptr null -} - -; Function Attrs: mustprogress uwtable -define internal ptr @_Z3foov() #5 { -entry: - %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7, !memprof !8, !callsite !21 - ret ptr null -} - -declare ptr @_Znam(i64) #6 - -; uselistorder directives -uselistorder ptr @_Z3foov, { 3, 2, 1, 0 } - -attributes #0 = { "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" } -attributes #1 = { noinline } -attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) } -attributes #3 = { nobuiltin } -attributes #4 = { "tune-cpu"="generic" } -attributes #5 = { mustprogress uwtable "disable-tail-calls"="true" "frame-pointer"="all" 
"min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } -attributes #6 = { "disable-tail-calls"="true" } -attributes #7 = { builtin } - -!0 = !{i64 8632435727821051414} -!1 = !{i64 -3421689549917153178} -!2 = !{i64 6792096022461663180} -!3 = !{i64 -2709642582978494015} -!4 = !{i64 748269490701775343} -!5 = !{i64 -5747251260480066785} -!6 = !{i64 8256774051149711748} -!7 = !{i64 -4831879094954754638} -!8 = !{!9, !11, !13, !15, !17, !19} -!9 = !{!10, !"notcold"} -!10 = !{i64 2732490490862098848, i64 8256774051149711748, i64 -4820244510750103755, i64 748269490701775343} -!11 = !{!12, !"cold"} -!12 = !{i64 2732490490862098848, i64 8256774051149711748, i64 -4820244510750103755, i64 -5747251260480066785} -!13 = !{!14, !"notcold"} -!14 = !{i64 2732490490862098848, i64 8632435727821051414} -!15 = !{!16, !"cold"} -!16 = !{i64 2732490490862098848, i64 -4831879094954754638, i64 -4820244510750103755, i64 6792096022461663180} -!17 = !{!18, !"notcold"} -!18 = !{i64 2732490490862098848, i64 -4831879094954754638, i64 -4820244510750103755, i64 -2709642582978494015} -!19 = !{!20, !"cold"} -!20 = !{i64 2732490490862098848, i64 -3421689549917153178} -!21 = !{i64 2732490490862098848} - - -; DUMP: CCG before cloning: -; DUMP: Callsite Context Graph: -; DUMP: Node [[FOO:0x[a-z0-9]+]] -; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7 (clone 0) -; DUMP: AllocTypes: NotColdCold -; DUMP: ContextIds: 1 2 3 4 5 6 -; DUMP: CalleeEdges: -; DUMP: CallerEdges: -; DUMP: Edge from Callee [[FOO]] to Caller: [[AX:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 -; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 3 -; DUMP: Edge from Callee [[FOO]] to Caller: [[BX:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 4 5 -; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 6 - -; DUMP: Node [[AX]] -; DUMP: %call = call noundef ptr @_Z3foov() (clone 0) -; DUMP: AllocTypes: NotColdCold -; DUMP: ContextIds: 1 2 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[FOO]] to Caller: [[AX]] AllocTypes: NotColdCold ContextIds: 1 2 -; DUMP: CallerEdges: -; DUMP: Edge from Callee [[AX]] to Caller: [[BAR:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 - -;; Bar contains an indirect call, with multiple targets. It's call should be null. 
-; DUMP: Node [[BAR]] -; DUMP: null Call -; DUMP: AllocTypes: NotColdCold -; DUMP: ContextIds: 1 2 4 5 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[AX]] to Caller: [[BAR]] AllocTypes: NotColdCold ContextIds: 1 2 -; DUMP: Edge from Callee [[BX]] to Caller: [[BAR]] AllocTypes: NotColdCold ContextIds: 4 5 -; DUMP: CallerEdges: -; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN3:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 -; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN4:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 -; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN5:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4 -; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN6:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 5 - -; DUMP: Node [[MAIN3]] -; DUMP: %call4 = call noundef ptr @_Z3barP1A(ptr noundef %a) (clone 0) -; DUMP: AllocTypes: NotCold -; DUMP: ContextIds: 1 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN3]] AllocTypes: NotCold ContextIds: 1 -; DUMP: CallerEdges: - -; DUMP: Node [[MAIN4]] -; DUMP: %call5 = call noundef ptr @_Z3barP1A(ptr noundef %a) (clone 0) -; DUMP: AllocTypes: Cold -; DUMP: ContextIds: 2 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN4]] AllocTypes: Cold ContextIds: 2 -; DUMP: CallerEdges: - -; DUMP: Node [[MAIN1]] -; DUMP: %call = call noundef ptr @_Z3foov() (clone 0) -; DUMP: AllocTypes: NotCold -; DUMP: ContextIds: 3 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 3 -; DUMP: CallerEdges: - -; DUMP: Node [[BX]] -; DUMP: %call = call noundef ptr @_Z3foov() (clone 0) -; DUMP: AllocTypes: NotColdCold -; DUMP: ContextIds: 4 5 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[FOO]] to Caller: [[BX]] AllocTypes: NotColdCold ContextIds: 4 5 -; DUMP: CallerEdges: -; DUMP: Edge from Callee [[BX]] to Caller: [[BAR]] AllocTypes: NotColdCold ContextIds: 4 5 - -; DUMP: Node [[MAIN5]] -; DUMP: %call2 = call noundef ptr @_Z3barP1A(ptr noundef %b) (clone 0) -; DUMP: AllocTypes: Cold -; DUMP: ContextIds: 4 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN5]] AllocTypes: Cold ContextIds: 4 -; DUMP: CallerEdges: - -; DUMP: Node [[MAIN6]] -; DUMP: %call3 = call noundef ptr @_Z3barP1A(ptr noundef %b) (clone 0) -; DUMP: AllocTypes: NotCold -; DUMP: ContextIds: 5 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN6]] AllocTypes: NotCold ContextIds: 5 -; DUMP: CallerEdges: - -; DUMP: Node [[MAIN2]] -; DUMP: %call1 = call noundef ptr @_Z3foov() (clone 0) -; DUMP: AllocTypes: Cold -; DUMP: ContextIds: 6 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 6 -; DUMP: CallerEdges: - - -; DOT: digraph "postbuild" { -; DOT: label="postbuild"; -; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2 3 4 5 6",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3foov -\> _Znam}"]; -; DOT: Node[[AX:0x[a-z0-9]+]] [shape=record,tooltip="N[[AX]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 8256774051149711748\n_ZN1A1xEv -\> _Z3foov}"]; -; DOT: Node[[AX]] -> Node[[FOO]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"]; -; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2 4 5",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 13626499562959447861\nnull call (external)}"]; -; DOT: Node[[BAR]] -> Node[[AX]][tooltip="ContextIds: 1 
2",fillcolor="mediumorchid1"]; -; DOT: Node[[BAR]] -> Node[[BX:0x[a-z0-9]+]][tooltip="ContextIds: 4 5",fillcolor="mediumorchid1"]; -; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 748269490701775343\nmain -\> _Z3barP1A}"]; -; DOT: Node[[MAIN1]] -> Node[[BAR]][tooltip="ContextIds: 1",fillcolor="brown1"]; -; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 12699492813229484831\nmain -\> _Z3barP1A}"]; -; DOT: Node[[MAIN2]] -> Node[[BAR]][tooltip="ContextIds: 2",fillcolor="cyan"]; -; DOT: Node[[MAIN3:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN3]] ContextIds: 3",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; -; DOT: Node[[MAIN3]] -> Node[[FOO]][tooltip="ContextIds: 3",fillcolor="brown1"]; -; DOT: Node[[BX]] [shape=record,tooltip="N[[BX]] ContextIds: 4 5",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 13614864978754796978\n_ZN1B1xEv -\> _Z3foov}"]; -; DOT: Node[[BX]] -> Node[[FOO]][tooltip="ContextIds: 4 5",fillcolor="mediumorchid1"]; -; DOT: Node[[MAIN4:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN4]] ContextIds: 4",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 6792096022461663180\nmain -\> _Z3barP1A}"]; -; DOT: Node[[MAIN4]] -> Node[[BAR]][tooltip="ContextIds: 4",fillcolor="cyan"]; -; DOT: Node[[MAIN5:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN5]] ContextIds: 5",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 15737101490731057601\nmain -\> _Z3barP1A}"]; -; DOT: Node[[MAIN5]] -> Node[[BAR]][tooltip="ContextIds: 5",fillcolor="brown1"]; -; DOT: Node[[MAIN6:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN6]] ContextIds: 6",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; -; DOT: Node[[MAIN6]] -> Node[[FOO]][tooltip="ContextIds: 6",fillcolor="cyan"]; -; DOT: } diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll b/llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll deleted file mode 100644 index 59f135ca06627..0000000000000 --- a/llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll +++ /dev/null @@ -1,189 +0,0 @@ -;; Test callsite context graph generation for call graph with two memprof -;; contexts and partial inlining, requiring generation of a new fused node to -;; represent the inlined sequence while matching callsite nodes onto the graph. -;; -;; Original code looks like: -;; -;; char *bar() { -;; return new char[10]; -;; } -;; -;; char *baz() { -;; return bar(); -;; } -;; -;; char *foo() { -;; return baz(); -;; } -;; -;; int main(int argc, char **argv) { -;; char *x = foo(); -;; char *y = foo(); -;; memset(x, 0, 10); -;; memset(y, 0, 10); -;; delete[] x; -;; sleep(10); -;; delete[] y; -;; return 0; -;; } -;; -;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the -;; memory freed after sleep(10) results in cold lifetimes. -;; -;; The code below was created by forcing inlining of baz into foo, and -;; bar into baz. Due to the inlining of bar we will initially have two -;; allocation nodes in the graph. This tests that we correctly match -;; foo (with baz inlined) onto the graph nodes first, and generate a new -;; fused node for it. 
We should then not match baz (with bar inlined) as that -;; is not reached by the MIB contexts (since all calls from main will look -;; like main -> foo(+baz) -> bar after the inlining reflected in this IR). -;; -;; The IR was then reduced using llvm-reduce with the expected FileCheck input. - -; RUN: opt -passes=memprof-context-disambiguation \ -; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ -; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ -; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP - -; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT - - -target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -define internal ptr @_Z3barv() { -entry: - %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7, !memprof !0, !callsite !5 - ret ptr null -} - -; Function Attrs: nobuiltin -declare ptr @_Znam(i64) #0 - -; Function Attrs: mustprogress -define internal ptr @_Z3bazv() #1 { -entry: - %call.i = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7, !memprof !0, !callsite !6 - ret ptr null -} - -; Function Attrs: noinline -define internal ptr @_Z3foov() #2 { -entry: - %call.i = call noundef ptr @_Z3barv(), !callsite !7 - ret ptr null -} - -define i32 @main() #3 { -entry: - %call = call noundef ptr @_Z3foov(), !callsite !8 - %call1 = call noundef ptr @_Z3foov(), !callsite !9 - ret i32 0 -} - -; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) -declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #4 - -; Function Attrs: nounwind -declare void @_ZdaPv() #5 - -declare i32 @sleep() #6 - -attributes #0 = { nobuiltin } -attributes #1 = { mustprogress } -attributes #2 = { noinline } -attributes #3 = { "tune-cpu"="generic" } -attributes #4 = { nocallback nofree nounwind willreturn memory(argmem: write) } -attributes #5 = { nounwind } -attributes #6 = { "disable-tail-calls"="true" } -attributes #7 = { builtin } - -!0 = !{!1, !3} -!1 = !{!2, !"notcold"} -!2 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} -!3 = !{!4, !"cold"} -!4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} -!5 = !{i64 9086428284934609951} -!6 = !{i64 9086428284934609951, i64 -5964873800580613432} -!7 = !{i64 -5964873800580613432, i64 2732490490862098848} -!8 = !{i64 8632435727821051414} -!9 = !{i64 -3421689549917153178} - - -; DUMP: CCG before cloning: -; DUMP: Callsite Context Graph: -; DUMP: Node [[BAR:0x[a-z0-9]+]] -; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7 (clone 0) -; DUMP: AllocTypes: NotColdCold -; DUMP: ContextIds: 1 2 -; DUMP: CalleeEdges: -; DUMP: CallerEdges: -; DUMP: Edge from Callee [[BAR]] to Caller: [[FOO:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 - -;; This is leftover from the MIB on the alloc inlined into baz. It is not -;; matched with any call, since there is no such node in the IR. Due to the -;; null call it will not participate in any context transformations. 
-; DUMP: Node [[FOO2:0x[a-z0-9]+]] -; DUMP: null Call -; DUMP: AllocTypes: NotColdCold -; DUMP: ContextIds: 3 4 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[BAZ:0x[a-z0-9]+]] to Caller: [[FOO2]] AllocTypes: NotColdCold ContextIds: 3 4 -; DUMP: CallerEdges: -; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 3 -; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4 - -; DUMP: Node [[MAIN1]] -; DUMP: %call = call noundef ptr @_Z3foov() (clone 0) -; DUMP: AllocTypes: NotCold -; DUMP: ContextIds: 1 3 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 3 -; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 -; DUMP: CallerEdges: - -; DUMP: Node [[MAIN2]] -; DUMP: %call1 = call noundef ptr @_Z3foov() (clone 0) -; DUMP: AllocTypes: Cold -; DUMP: ContextIds: 2 4 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 4 -; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 -; DUMP: CallerEdges: - -; DUMP: Node [[BAZ]] -; DUMP: %call.i = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7 (clone 0) -; DUMP: AllocTypes: NotColdCold -; DUMP: ContextIds: 3 4 -; DUMP: CalleeEdges: -; DUMP: CallerEdges: -; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO2]] AllocTypes: NotColdCold ContextIds: 3 4 - -;; This is the node synthesized for the call to bar in foo that was created -;; by inlining baz into foo. -; DUMP: Node [[FOO]] -; DUMP: %call.i = call noundef ptr @_Z3barv() (clone 0) -; DUMP: AllocTypes: NotColdCold -; DUMP: ContextIds: 1 2 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[BAR]] to Caller: [[FOO]] AllocTypes: NotColdCold ContextIds: 1 2 -; DUMP: CallerEdges: -; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 -; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 - - -; DOT: digraph "postbuild" { -; DOT: label="postbuild"; -; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"]; -; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 2732490490862098848\nnull call (external)}"]; -; DOT: Node[[FOO]] -> Node[[BAZ:0x[a-z0-9]+]][tooltip="ContextIds: 3 4",fillcolor="mediumorchid1"]; -; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1 3",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; -; DOT: Node[[MAIN1]] -> Node[[FOO]][tooltip="ContextIds: 3",fillcolor="brown1"]; -; DOT: Node[[MAIN1]] -> Node[[FOO2:0x[a-z0-9]+]][tooltip="ContextIds: 1",fillcolor="brown1"]; -; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2 4",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; -; DOT: Node[[MAIN2]] -> Node[[FOO]][tooltip="ContextIds: 4",fillcolor="cyan"]; -; DOT: Node[[MAIN2]] -> Node[[FOO2]][tooltip="ContextIds: 2",fillcolor="cyan"]; -; DOT: Node[[BAZ]] [shape=record,tooltip="N[[BAZ]] ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc2\n_Z3bazv -\> _Znam}"]; -; DOT: Node[[FOO2]] [shape=record,tooltip="N[[FOO2]] ContextIds: 1 
2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 0\n_Z3foov -\> _Z3barv}"]; -; DOT: Node[[FOO2]] -> Node[[BAR]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"]; -; DOT: } diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/inlined2.ll b/llvm/test/Transforms/MemProfContextDisambiguation/inlined2.ll deleted file mode 100644 index a3a056ade8c49..0000000000000 --- a/llvm/test/Transforms/MemProfContextDisambiguation/inlined2.ll +++ /dev/null @@ -1,135 +0,0 @@ -;; Test callsite context graph generation for call graph with two memprof -;; contexts and multiple levels of inlining, requiring generation of new -;; fused nodes to represent the inlined sequence while matching callsite -;; nodes onto the graph. In particular this tests the case where a function -;; has inlined a callee containing an inlined callee. -;; -;; Original code looks like: -;; -;; char *bar() __attribute__((noinline)) { -;; return new char[10]; -;; } -;; -;; char *baz() { -;; return bar(); -;; } -;; -;; char *foo() { -;; return baz(); -;; } -;; -;; int main(int argc, char **argv) { -;; char *x = foo(); -;; char *y = foo(); -;; memset(x, 0, 10); -;; memset(y, 0, 10); -;; delete[] x; -;; sleep(10); -;; delete[] y; -;; return 0; -;; } -;; -;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the -;; memory freed after sleep(10) results in cold lifetimes. -;; -;; Both foo and baz are inlined into main, at both foo callsites. -;; We should update the graph for new fused nodes for both of those inlined -;; callsites to bar. -;; -;; Note that baz and bar are both dead due to the inlining, but have been left -;; in the input IR to ensure that the MIB call chain is matched to the longer -;; inline sequences from main. -;; -;; The IR was then reduced using llvm-reduce with the expected FileCheck input. 
- -; RUN: opt -passes=memprof-context-disambiguation \ -; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ -; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP - - -target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -define ptr @_Z3barv() #0 { -entry: - %call = call noalias noundef nonnull dereferenceable(10) ptr @_Znam(i64 noundef 10) #7, !memprof !7, !callsite !12, !heapallocsite !13 - ret ptr null -} - -; Function Attrs: nobuiltin -declare ptr @_Znam(i64) #1 - -; Function Attrs: mustprogress -declare ptr @_Z3bazv() #2 - -define i32 @main() #3 { -delete.end5: - %call.i.i = call noundef ptr @_Z3barv(), !callsite !14 - %call.i.i8 = call noundef ptr @_Z3barv(), !callsite !15 - ret i32 0 -} - -; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) -declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #4 - -declare void @_ZdaPv() #5 - -declare i32 @sleep() #6 - -attributes #0 = { "stack-protector-buffer-size"="8" } -attributes #1 = { nobuiltin } -attributes #2 = { mustprogress } -attributes #3 = { "tune-cpu"="generic" } -attributes #4 = { nocallback nofree nounwind willreturn memory(argmem: write) } -attributes #5 = { "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" } -attributes #6 = { "disable-tail-calls"="true" } -attributes #7 = { builtin } - -!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6} - -!0 = !{i32 7, !"Dwarf Version", i32 5} -!1 = !{i32 2, !"Debug Info Version", i32 3} -!2 = !{i32 1, !"wchar_size", i32 4} -!3 = !{i32 8, !"PIC Level", i32 2} -!4 = !{i32 7, !"PIE Level", i32 2} -!5 = !{i32 7, !"uwtable", i32 2} -!6 = !{i32 7, !"frame-pointer", i32 2} -!7 = !{!8, !10} -!8 = !{!9, !"notcold"} -!9 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} -!10 = !{!11, !"cold"} -!11 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} -!12 = !{i64 9086428284934609951} -!13 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char) -!14 = !{i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} -!15 = !{i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} - - -; DUMP: CCG before cloning: -; DUMP: Callsite Context Graph: -; DUMP: Node [[BAR:0x[a-z0-9]+]] -; DUMP: %call = call noalias noundef nonnull dereferenceable(10) ptr @_Znam(i64 noundef 10) #7, !heapallocsite !7 (clone 0) -; DUMP: AllocTypes: NotColdCold -; DUMP: ContextIds: 1 2 -; DUMP: CalleeEdges: -; DUMP: CallerEdges: -; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 -; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 - -;; This is the node synthesized for the first inlined call chain of main->foo->baz -; DUMP: Node [[MAIN1]] -; DUMP: %call.i.i = call noundef ptr @_Z3barv() (clone 0) -; DUMP: AllocTypes: NotCold -; DUMP: ContextIds: 1 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 -; DUMP: CallerEdges: - -;; This is the node synthesized for the second inlined call chain of main->foo->baz -; DUMP: Node [[MAIN2]] -; DUMP: %call.i.i8 = call noundef ptr @_Z3barv() (clone 0) -; DUMP: AllocTypes: Cold -; DUMP: ContextIds: 2 -; DUMP: CalleeEdges: -; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 -; DUMP: CallerEdges: diff --git 
a/llvm/test/Transforms/MemProfContextDisambiguation/pass-pipeline.ll b/llvm/test/Transforms/MemProfContextDisambiguation/pass-pipeline.ll
deleted file mode 100644
index fede5fe96eccd..0000000000000
--- a/llvm/test/Transforms/MemProfContextDisambiguation/pass-pipeline.ll
+++ /dev/null
@@ -1,41 +0,0 @@
-;; Test that MemProfContextDisambiguation is enabled under the expected conditions
-;; and in the expected position.
-
-;; Pass is not currently enabled by default at any opt level.
-; RUN: opt -debug-pass-manager -passes='lto<O0>' -S %s \
-; RUN:   2>&1 | FileCheck %s --implicit-check-not="Running pass: MemProfContextDisambiguation"
-; RUN: opt -debug-pass-manager -passes='lto<O1>' -S %s \
-; RUN:   2>&1 | FileCheck %s --implicit-check-not="Running pass: MemProfContextDisambiguation"
-; RUN: opt -debug-pass-manager -passes='lto<O2>' -S %s \
-; RUN:   2>&1 | FileCheck %s --implicit-check-not="Running pass: MemProfContextDisambiguation"
-; RUN: opt -debug-pass-manager -passes='lto<O3>' -S %s \
-; RUN:   2>&1 | FileCheck %s --implicit-check-not="Running pass: MemProfContextDisambiguation"
-
-;; Pass should not run even under option at O0/O1.
-; RUN: opt -debug-pass-manager -passes='lto<O0>' -S %s \
-; RUN:   -enable-memprof-context-disambiguation \
-; RUN:   2>&1 | FileCheck %s --implicit-check-not="Running pass: MemProfContextDisambiguation"
-; RUN: opt -debug-pass-manager -passes='lto<O1>' -S %s \
-; RUN:   -enable-memprof-context-disambiguation \
-; RUN:   2>&1 | FileCheck %s --implicit-check-not="Running pass: MemProfContextDisambiguation"
-
-;; Pass should be enabled under option at O2/O3.
-; RUN: opt -debug-pass-manager -passes='lto<O2>' -S %s \
-; RUN:   -enable-memprof-context-disambiguation \
-; RUN:   2>&1 | FileCheck %s --check-prefix=ENABLED
-; RUN: opt -debug-pass-manager -passes='lto<O3>' -S %s \
-; RUN:   -enable-memprof-context-disambiguation \
-; RUN:   2>&1 | FileCheck %s --check-prefix=ENABLED
-
-;; When enabled, MemProfContextDisambiguation runs just after inlining.
-; ENABLED: Running pass: InlinerPass -; ENABLED: Invalidating analysis: InlineAdvisorAnalysis -; ENABLED: Running pass: MemProfContextDisambiguation - -define noundef ptr @_Z3barv() { -entry: - %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) - ret ptr %call -} - -declare noundef nonnull ptr @_Znam(i64 noundef) From 553bff0e9c571c4b53520126e8c0f6fe2ed966a0 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 22 Mar 2023 14:45:00 +0000 Subject: [PATCH 002/208] [gn build] Port 883dbb9c86be --- llvm/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn index 0dbeb793e40eb..644d30f10854e 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn @@ -48,7 +48,6 @@ static_library("IPO") { "Internalize.cpp", "LoopExtractor.cpp", "LowerTypeTests.cpp", - "MemProfContextDisambiguation.cpp", "MergeFunctions.cpp", "ModuleInliner.cpp", "OpenMPOpt.cpp", From 65a0d669b4625c34775436a6d3643d15bbc2465a Mon Sep 17 00:00:00 2001 From: Doru Bercea Date: Wed, 22 Feb 2023 11:58:48 -0500 Subject: [PATCH 003/208] Fix accessing of aligned arrays in offloaded target regions --- clang/lib/Sema/SemaOpenMP.cpp | 8 +- .../amdgpu_target_with_aligned_attribute.c | 305 + .../OpenMP/parallel_firstprivate_codegen.cpp | 328 +- ...l_master_taskloop_firstprivate_codegen.cpp | 2562 +++- ...ter_taskloop_simd_firstprivate_codegen.cpp | 2664 +++- .../OpenMP/target_firstprivate_codegen.cpp | 12203 +++++++++++++++- .../OpenMP/target_is_device_ptr_codegen.cpp | 5825 +++++++- .../OpenMP/teams_firstprivate_codegen.cpp | 752 +- 8 files changed, 22804 insertions(+), 1843 deletions(-) create mode 100644 clang/test/OpenMP/amdgpu_target_with_aligned_attribute.c diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index e193fa3d19d5c..1cd263b8a5b1c 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -2273,10 +2273,10 @@ bool Sema::isOpenMPCapturedByRef(const ValueDecl *D, unsigned Level, // and alignment, because the runtime library only deals with uintptr types. // If it does not fit the uintptr size, we need to pass the data by reference // instead. 
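For concreteness, the case this size/alignment check decides is the one exercised by the new amdgpu_target_with_aligned_attribute.c test added by this patch; the function (reproduced from that test, with one added comment) declares a pointer variable whose alignment exceeds that of a uintptr:

void write_to_aligned_array(int *a, int N) {
  // aptr itself fits in a uintptr, but its declared 64-byte alignment is larger
  // than the uintptr alignment, which is the alignment side of the comparison
  // below when deciding between by-value and by-reference capture.
  int *aptr __attribute__ ((aligned(64))) = a;
  #pragma omp target teams distribute parallel for map(tofrom: aptr[0:N])
  for(int i = 0; i < N; i++) {
    aptr[i] = i;
  }
}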
- if (!IsByRef && - (Ctx.getTypeSizeInChars(Ty) > - Ctx.getTypeSizeInChars(Ctx.getUIntPtrType()) || - Ctx.getDeclAlign(D) > Ctx.getTypeAlignInChars(Ctx.getUIntPtrType()))) { + if (!IsByRef && (Ctx.getTypeSizeInChars(Ty) > + Ctx.getTypeSizeInChars(Ctx.getUIntPtrType()) || + Ctx.getAlignOfGlobalVarInChars(Ty) > + Ctx.getTypeAlignInChars(Ctx.getUIntPtrType()))) { IsByRef = true; } diff --git a/clang/test/OpenMP/amdgpu_target_with_aligned_attribute.c b/clang/test/OpenMP/amdgpu_target_with_aligned_attribute.c new file mode 100644 index 0000000000000..e33ad0b353f51 --- /dev/null +++ b/clang/test/OpenMP/amdgpu_target_with_aligned_attribute.c @@ -0,0 +1,305 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ +// REQUIRES: amdgpu-registered-target + +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +// RUN: %clang_cc1 -verify -fopenmp -x c -triple powerpc64le-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host-amd.bc +// RUN: %clang_cc1 -verify -fopenmp -x c -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host-amd.bc -o - | FileCheck %s --check-prefix=CHECK-AMD + + +void write_to_aligned_array(int *a, int N) { + int *aptr __attribute__ ((aligned(64))) = a; + #pragma omp target teams distribute parallel for map(tofrom: aptr[0:N]) + for(int i = 0; i < N; i++) { + aptr[i] = i; + } +} + +#endif +// CHECK-AMD-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_write_to_aligned_array_l14 +// CHECK-AMD-SAME: (i64 noundef [[N:%.*]], ptr noundef [[APTR:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-AMD-NEXT: entry: +// CHECK-AMD-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-AMD-NEXT: [[APTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-AMD-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-AMD-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr +// CHECK-AMD-NEXT: [[APTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[APTR_ADDR]] to ptr +// CHECK-AMD-NEXT: [[N_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_CASTED]] to ptr +// CHECK-AMD-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr +// CHECK-AMD-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr +// CHECK-AMD-NEXT: store i64 [[N]], ptr [[N_ADDR_ASCAST]], align 8 +// CHECK-AMD-NEXT: store ptr [[APTR]], ptr [[APTR_ADDR_ASCAST]], align 8 +// CHECK-AMD-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), i8 2, i1 false) +// CHECK-AMD-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +// CHECK-AMD-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +// CHECK-AMD: user_code.entry: +// CHECK-AMD-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr)) +// CHECK-AMD-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4 +// CHECK-AMD-NEXT: store i32 [[TMP2]], ptr [[N_CASTED_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[TMP3:%.*]] 
= load i64, ptr [[N_CASTED_ASCAST]], align 8 +// CHECK-AMD-NEXT: [[TMP4:%.*]] = load ptr, ptr [[APTR_ADDR_ASCAST]], align 8 +// CHECK-AMD-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4 +// CHECK-AMD-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4 +// CHECK-AMD-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP3]], ptr [[TMP4]]) #[[ATTR2:[0-9]+]] +// CHECK-AMD-NEXT: call void @__kmpc_target_deinit(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i8 2) +// CHECK-AMD-NEXT: ret void +// CHECK-AMD: worker.exit: +// CHECK-AMD-NEXT: ret void +// +// +// CHECK-AMD-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK-AMD-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[N:%.*]], ptr noundef [[APTR:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK-AMD-NEXT: entry: +// CHECK-AMD-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-AMD-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-AMD-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-AMD-NEXT: [[APTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-AMD-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[TMP:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[I3:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-AMD-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8, addrspace(5) +// CHECK-AMD-NEXT: [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr +// CHECK-AMD-NEXT: [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr +// CHECK-AMD-NEXT: [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr +// CHECK-AMD-NEXT: [[APTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[APTR_ADDR]] to ptr +// CHECK-AMD-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr +// CHECK-AMD-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr +// CHECK-AMD-NEXT: [[DOTCAPTURE_EXPR__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCAPTURE_EXPR_]] to ptr +// CHECK-AMD-NEXT: [[DOTCAPTURE_EXPR_1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCAPTURE_EXPR_1]] to ptr +// CHECK-AMD-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr +// CHECK-AMD-NEXT: [[DOTOMP_COMB_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_COMB_LB]] to ptr +// CHECK-AMD-NEXT: [[DOTOMP_COMB_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_COMB_UB]] to ptr +// CHECK-AMD-NEXT: [[DOTOMP_STRIDE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE]] to ptr +// CHECK-AMD-NEXT: [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr +// CHECK-AMD-NEXT: [[I3_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[I3]] to ptr +// CHECK-AMD-NEXT: [[N_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_CASTED]] to ptr +// CHECK-AMD-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr +// CHECK-AMD-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8 +// CHECK-AMD-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8 +// CHECK-AMD-NEXT: store i64 [[N]], ptr [[N_ADDR_ASCAST]], align 8 +// CHECK-AMD-NEXT: store ptr [[APTR]], ptr [[APTR_ADDR_ASCAST]], align 8 +// CHECK-AMD-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4 +// CHECK-AMD-NEXT: store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR__ASCAST]], align 4 +// CHECK-AMD-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ASCAST]], align 4 +// CHECK-AMD-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0 +// CHECK-AMD-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK-AMD-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK-AMD-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1_ASCAST]], align 4 +// CHECK-AMD-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ASCAST]], align 4 +// CHECK-AMD-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP2]] +// CHECK-AMD-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK-AMD: omp.precond.then: +// CHECK-AMD-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1_ASCAST]], align 4 +// CHECK-AMD-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_COMB_UB_ASCAST]], align 4 +// CHECK-AMD-NEXT: store i32 1, ptr [[DOTOMP_STRIDE_ASCAST]], align 4 +// CHECK-AMD-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-AMD-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8 +// CHECK-AMD-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 +// CHECK-AMD-NEXT: call void @__kmpc_distribute_static_init_4(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), i32 [[TMP5]], i32 91, ptr [[DOTOMP_IS_LAST_ASCAST]], ptr [[DOTOMP_COMB_LB_ASCAST]], ptr [[DOTOMP_COMB_UB_ASCAST]], ptr [[DOTOMP_STRIDE_ASCAST]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK-AMD-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]] +// CHECK-AMD-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK-AMD: cond.true: +// CHECK-AMD-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1_ASCAST]], align 4 +// CHECK-AMD-NEXT: br label [[COND_END:%.*]] +// CHECK-AMD: cond.false: +// CHECK-AMD-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4 +// CHECK-AMD-NEXT: br label [[COND_END]] +// CHECK-AMD: cond.end: +// CHECK-AMD-NEXT: [[COND:%.*]] = phi i32 [ [[TMP8]], [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ] +// CHECK-AMD-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4 +// CHECK-AMD-NEXT: store i32 [[TMP10]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-AMD-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK-AMD: omp.inner.for.cond: +// CHECK-AMD-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[TMP12:%.*]] = 
load i32, ptr [[DOTCAPTURE_EXPR_1_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], 1 +// CHECK-AMD-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP11]], [[ADD]] +// CHECK-AMD-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK-AMD: omp.inner.for.body: +// CHECK-AMD-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[TMP14:%.*]] = zext i32 [[TMP13]] to i64 +// CHECK-AMD-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[TMP16:%.*]] = zext i32 [[TMP15]] to i64 +// CHECK-AMD-NEXT: [[TMP17:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4 +// CHECK-AMD-NEXT: store i32 [[TMP17]], ptr [[N_CASTED_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[TMP18:%.*]] = load i64, ptr [[N_CASTED_ASCAST]], align 8 +// CHECK-AMD-NEXT: [[TMP19:%.*]] = load ptr, ptr [[APTR_ADDR_ASCAST]], align 8 +// CHECK-AMD-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 0 +// CHECK-AMD-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP14]] to ptr +// CHECK-AMD-NEXT: store ptr [[TMP21]], ptr [[TMP20]], align 8 +// CHECK-AMD-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 1 +// CHECK-AMD-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP16]] to ptr +// CHECK-AMD-NEXT: store ptr [[TMP23]], ptr [[TMP22]], align 8 +// CHECK-AMD-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 2 +// CHECK-AMD-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP18]] to ptr +// CHECK-AMD-NEXT: store ptr [[TMP25]], ptr [[TMP24]], align 8 +// CHECK-AMD-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 3 +// CHECK-AMD-NEXT: store ptr [[TMP19]], ptr [[TMP26]], align 8 +// CHECK-AMD-NEXT: [[TMP27:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8 +// CHECK-AMD-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4 +// CHECK-AMD-NEXT: call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP28]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__.1, ptr null, ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 4) +// CHECK-AMD-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK-AMD: omp.inner.for.inc: +// CHECK-AMD-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK-AMD-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK-AMD-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_COMB_LB_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK-AMD-NEXT: store i32 [[ADD8]], ptr [[DOTOMP_COMB_UB_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] +// CHECK-AMD-NEXT: br i1 [[CMP9]], label 
[[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK-AMD: cond.true10: +// CHECK-AMD-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1_ASCAST]], align 4 +// CHECK-AMD-NEXT: br label [[COND_END12:%.*]] +// CHECK-AMD: cond.false11: +// CHECK-AMD-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4 +// CHECK-AMD-NEXT: br label [[COND_END12]] +// CHECK-AMD: cond.end12: +// CHECK-AMD-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] +// CHECK-AMD-NEXT: store i32 [[COND13]], ptr [[DOTOMP_COMB_UB_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4 +// CHECK-AMD-NEXT: store i32 [[TMP39]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-AMD-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK-AMD: omp.inner.for.end: +// CHECK-AMD-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK-AMD: omp.loop.exit: +// CHECK-AMD-NEXT: [[TMP40:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8 +// CHECK-AMD-NEXT: [[TMP41:%.*]] = load i32, ptr [[TMP40]], align 4 +// CHECK-AMD-NEXT: call void @__kmpc_distribute_static_fini(ptr addrspacecast (ptr addrspace(1) @[[GLOB2]] to ptr), i32 [[TMP41]]) +// CHECK-AMD-NEXT: br label [[OMP_PRECOND_END]] +// CHECK-AMD: omp.precond.end: +// CHECK-AMD-NEXT: ret void +// +// +// CHECK-AMD-LABEL: define {{[^@]+}}@__omp_outlined__.1 +// CHECK-AMD-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[N:%.*]], ptr noundef [[APTR:%.*]]) #[[ATTR1]] { +// CHECK-AMD-NEXT: entry: +// CHECK-AMD-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-AMD-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-AMD-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-AMD-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-AMD-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-AMD-NEXT: [[APTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-AMD-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[TMP:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[I4:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-AMD-NEXT: [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr +// CHECK-AMD-NEXT: [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr +// CHECK-AMD-NEXT: [[DOTPREVIOUS_LB__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTPREVIOUS_LB__ADDR]] to ptr +// CHECK-AMD-NEXT: [[DOTPREVIOUS_UB__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTPREVIOUS_UB__ADDR]] to ptr +// CHECK-AMD-NEXT: [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr +// CHECK-AMD-NEXT: [[APTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[APTR_ADDR]] to ptr +// 
CHECK-AMD-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr +// CHECK-AMD-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr +// CHECK-AMD-NEXT: [[DOTCAPTURE_EXPR__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCAPTURE_EXPR_]] to ptr +// CHECK-AMD-NEXT: [[DOTCAPTURE_EXPR_1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCAPTURE_EXPR_1]] to ptr +// CHECK-AMD-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr +// CHECK-AMD-NEXT: [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr +// CHECK-AMD-NEXT: [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr +// CHECK-AMD-NEXT: [[DOTOMP_STRIDE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE]] to ptr +// CHECK-AMD-NEXT: [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr +// CHECK-AMD-NEXT: [[I4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I4]] to ptr +// CHECK-AMD-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8 +// CHECK-AMD-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8 +// CHECK-AMD-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR_ASCAST]], align 8 +// CHECK-AMD-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR_ASCAST]], align 8 +// CHECK-AMD-NEXT: store i64 [[N]], ptr [[N_ADDR_ASCAST]], align 8 +// CHECK-AMD-NEXT: store ptr [[APTR]], ptr [[APTR_ADDR_ASCAST]], align 8 +// CHECK-AMD-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4 +// CHECK-AMD-NEXT: store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR__ASCAST]], align 4 +// CHECK-AMD-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ASCAST]], align 4 +// CHECK-AMD-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0 +// CHECK-AMD-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK-AMD-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK-AMD-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1_ASCAST]], align 4 +// CHECK-AMD-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ASCAST]], align 4 +// CHECK-AMD-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP2]] +// CHECK-AMD-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK-AMD: omp.precond.then: +// CHECK-AMD-NEXT: store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1_ASCAST]], align 4 +// CHECK-AMD-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[TMP4:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR_ASCAST]], align 8 +// CHECK-AMD-NEXT: [[CONV:%.*]] = trunc i64 [[TMP4]] to i32 +// CHECK-AMD-NEXT: [[TMP5:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR_ASCAST]], align 8 +// CHECK-AMD-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK-AMD-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-AMD-NEXT: store i32 [[CONV3]], ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-AMD-NEXT: store i32 1, ptr [[DOTOMP_STRIDE_ASCAST]], align 4 +// CHECK-AMD-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8 +// CHECK-AMD-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +// CHECK-AMD-NEXT: call void @__kmpc_for_static_init_4(ptr addrspacecast (ptr addrspace(1) @[[GLOB3:[0-9]+]] to ptr), i32 [[TMP7]], i32 33, ptr [[DOTOMP_IS_LAST_ASCAST]], ptr [[DOTOMP_LB_ASCAST]], ptr [[DOTOMP_UB_ASCAST]], ptr 
[[DOTOMP_STRIDE_ASCAST]], i32 1, i32 1) +// CHECK-AMD-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-AMD-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-AMD-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK-AMD: omp.inner.for.cond: +// CHECK-AMD-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[CONV5:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK-AMD-NEXT: [[TMP10:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR_ASCAST]], align 8 +// CHECK-AMD-NEXT: [[CMP6:%.*]] = icmp ule i64 [[CONV5]], [[TMP10]] +// CHECK-AMD-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK-AMD: omp.inner.for.body: +// CHECK-AMD-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP11]], 1 +// CHECK-AMD-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-AMD-NEXT: store i32 [[ADD]], ptr [[I4_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[TMP12:%.*]] = load i32, ptr [[I4_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[TMP13:%.*]] = load ptr, ptr [[APTR_ADDR_ASCAST]], align 8 +// CHECK-AMD-NEXT: [[TMP14:%.*]] = load i32, ptr [[I4_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64 +// CHECK-AMD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[IDXPROM]] +// CHECK-AMD-NEXT: store i32 [[TMP12]], ptr [[ARRAYIDX]], align 4 +// CHECK-AMD-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK-AMD: omp.body.continue: +// CHECK-AMD-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK-AMD: omp.inner.for.inc: +// CHECK-AMD-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK-AMD-NEXT: store i32 [[ADD8]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-AMD-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK-AMD: omp.inner.for.end: +// CHECK-AMD-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK-AMD: omp.loop.exit: +// CHECK-AMD-NEXT: [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8 +// CHECK-AMD-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 +// CHECK-AMD-NEXT: call void @__kmpc_distribute_static_fini(ptr addrspacecast (ptr addrspace(1) @[[GLOB2]] to ptr), i32 [[TMP18]]) +// CHECK-AMD-NEXT: br label [[OMP_PRECOND_END]] +// CHECK-AMD: omp.precond.end: +// CHECK-AMD-NEXT: ret void +// diff --git a/clang/test/OpenMP/parallel_firstprivate_codegen.cpp b/clang/test/OpenMP/parallel_firstprivate_codegen.cpp index 2c70e05feafd0..845888dd32d19 100644 --- a/clang/test/OpenMP/parallel_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/parallel_firstprivate_codegen.cpp @@ -467,6 +467,8 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK1-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 128 // CHECK1-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S.0], align 128 // CHECK1-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S_0]], align 128 +// CHECK1-NEXT: [[T_VAR_CASTED:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[T_VAR_CASTED1:%.*]] = alloca i32, align 4 // CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr nonnull align 4 dereferenceable(4) [[TEST]]) // CHECK1-NEXT: call void @_ZN3SSTIiEC1Ev(ptr nonnull align 4 dereferenceable(4) [[SST]]) // CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 128 @@ -476,23 +478,29 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr 
inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1 // CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 2) // CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr nonnull align 4 dereferenceable(4) [[VAR]], i32 3) -// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 4, ptr @.omp_outlined..3, ptr [[VEC]], ptr [[T_VAR]], ptr [[S_ARR]], ptr [[VAR]]) -// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 1, ptr @.omp_outlined..4, ptr [[T_VAR]]) +// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[T_VAR]], align 128 +// CHECK1-NEXT: store i32 [[TMP0]], ptr [[T_VAR_CASTED]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 +// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 4, ptr @.omp_outlined..3, ptr [[VEC]], i32 [[TMP1]], ptr [[S_ARR]], ptr [[VAR]]) +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[T_VAR]], align 128 +// CHECK1-NEXT: store i32 [[TMP2]], ptr [[T_VAR_CASTED1]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_CASTED1]], align 4 +// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 1, ptr @.omp_outlined..4, i32 [[TMP3]]) // CHECK1-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK1-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR4]] // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i32 2 +// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i32 2 // CHECK1-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK1: arraydestroy.body: -// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP0]], [[ENTRY:%.*]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] +// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP4]], [[ENTRY:%.*]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK1-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 // CHECK1-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK1-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] -// CHECK1-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE1:%.*]], label [[ARRAYDESTROY_BODY]] -// CHECK1: arraydestroy.done1: +// CHECK1-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE2:%.*]], label [[ARRAYDESTROY_BODY]] +// CHECK1: arraydestroy.done2: // CHECK1-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4 -// CHECK1-NEXT: ret i32 [[TMP1]] +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[RETVAL]], align 4 +// CHECK1-NEXT: ret i32 [[TMP5]] // // // CHECK1-LABEL: define {{[^@]+}}@_ZN2SSC2ERi @@ -713,67 +721,63 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // // // CHECK1-LABEL: define {{[^@]+}}@.omp_outlined..3 -// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR3]] { +// CHECK1-SAME: (ptr noalias 
[[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(8) [[VEC:%.*]], i32 [[T_VAR:%.*]], ptr nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK1-NEXT: [[VEC_ADDR:%.*]] = alloca ptr, align 4 -// CHECK1-NEXT: [[T_VAR_ADDR:%.*]] = alloca ptr, align 4 +// CHECK1-NEXT: [[T_VAR_ADDR:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[S_ARR_ADDR:%.*]] = alloca ptr, align 4 // CHECK1-NEXT: [[VAR_ADDR:%.*]] = alloca ptr, align 4 -// CHECK1-NEXT: [[T_VAR1:%.*]] = alloca i32, align 128 -// CHECK1-NEXT: [[VEC2:%.*]] = alloca [2 x i32], align 128 -// CHECK1-NEXT: [[S_ARR3:%.*]] = alloca [2 x %struct.S.0], align 128 +// CHECK1-NEXT: [[VEC1:%.*]] = alloca [2 x i32], align 128 +// CHECK1-NEXT: [[S_ARR2:%.*]] = alloca [2 x %struct.S.0], align 128 // CHECK1-NEXT: [[AGG_TMP:%.*]] = alloca [[STRUCT_ST:%.*]], align 4 -// CHECK1-NEXT: [[VAR5:%.*]] = alloca [[STRUCT_S_0:%.*]], align 128 -// CHECK1-NEXT: [[AGG_TMP6:%.*]] = alloca [[STRUCT_ST]], align 4 +// CHECK1-NEXT: [[VAR4:%.*]] = alloca [[STRUCT_S_0:%.*]], align 128 +// CHECK1-NEXT: [[AGG_TMP5:%.*]] = alloca [[STRUCT_ST]], align 4 // CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 // CHECK1-NEXT: store ptr [[VEC]], ptr [[VEC_ADDR]], align 4 -// CHECK1-NEXT: store ptr [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 128 -// CHECK1-NEXT: store i32 [[TMP4]], ptr [[T_VAR1]], align 128 -// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 128 [[VEC2]], ptr align 128 [[TMP0]], i32 8, i1 false) -// CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR3]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i32 2 -// CHECK1-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP5]] -// CHECK1-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE4:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 128 [[VEC1]], ptr align 128 [[TMP0]], i32 8, i1 false) +// CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR2]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i32 2 +// CHECK1-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP3]] +// CHECK1-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE3:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]] // CHECK1: omp.arraycpy.body: -// CHECK1-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP2]], [[ENTRY:%.*]] ], [ 
[[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] +// CHECK1-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP1]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] // CHECK1-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] // CHECK1-NEXT: call void @_ZN2StC1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP]]) // CHECK1-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr nonnull align 4 dereferenceable(4) [[OMP_ARRAYCPY_DESTELEMENTPAST]], ptr nonnull align 4 dereferenceable(4) [[OMP_ARRAYCPY_SRCELEMENTPAST]], ptr [[AGG_TMP]]) // CHECK1-NEXT: call void @_ZN2StD1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP]]) #[[ATTR4]] // CHECK1-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr [[STRUCT_S_0]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1 // CHECK1-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr [[STRUCT_S_0]], ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1 -// CHECK1-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]] -// CHECK1-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]] -// CHECK1: omp.arraycpy.done4: -// CHECK1-NEXT: call void @_ZN2StC1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) -// CHECK1-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr nonnull align 4 dereferenceable(4) [[VAR5]], ptr nonnull align 4 dereferenceable(4) [[TMP3]], ptr [[AGG_TMP6]]) -// CHECK1-NEXT: call void @_ZN2StD1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[T_VAR1]], align 128 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC2]], i32 0, i32 0 -// CHECK1-NEXT: store i32 [[TMP6]], ptr [[ARRAYIDX]], align 128 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR3]], i32 0, i32 0 -// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 128 [[ARRAYIDX7]], ptr align 128 [[VAR5]], i32 4, i1 false) -// CHECK1-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR4]] -// CHECK1-NEXT: [[ARRAY_BEGIN8:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR3]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN8]], i32 2 +// CHECK1-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]] +// CHECK1-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE3]], label [[OMP_ARRAYCPY_BODY]] +// CHECK1: omp.arraycpy.done3: +// CHECK1-NEXT: call void @_ZN2StC1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP5]]) +// CHECK1-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr nonnull align 4 dereferenceable(4) [[VAR4]], ptr nonnull align 4 dereferenceable(4) [[TMP2]], ptr [[AGG_TMP5]]) +// CHECK1-NEXT: call void @_ZN2StD1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP5]]) #[[ATTR4]] +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC1]], i32 0, i32 0 +// CHECK1-NEXT: store i32 [[TMP4]], ptr [[ARRAYIDX]], align 128 +// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR2]], i32 0, i32 0 +// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 128 [[ARRAYIDX6]], ptr align 128 [[VAR4]], i32 4, i1 false) +// CHECK1-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[VAR4]]) #[[ATTR4]] +// CHECK1-NEXT: 
[[ARRAY_BEGIN7:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR2]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN7]], i32 2 // CHECK1-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK1: arraydestroy.body: -// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[OMP_ARRAYCPY_DONE4]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] +// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[OMP_ARRAYCPY_DONE3]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK1-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 // CHECK1-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] -// CHECK1-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN8]] -// CHECK1-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE9:%.*]], label [[ARRAYDESTROY_BODY]] -// CHECK1: arraydestroy.done9: +// CHECK1-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN7]] +// CHECK1-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE8:%.*]], label [[ARRAYDESTROY_BODY]] +// CHECK1: arraydestroy.done8: // CHECK1-NEXT: ret void // // @@ -803,18 +807,14 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // // // CHECK1-LABEL: define {{[^@]+}}@.omp_outlined..4 -// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR3]] { +// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i32 [[T_VAR:%.*]]) #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 -// CHECK1-NEXT: [[T_VAR_ADDR:%.*]] = alloca ptr, align 4 -// CHECK1-NEXT: [[T_VAR1:%.*]] = alloca i32, align 128 +// CHECK1-NEXT: [[T_VAR_ADDR:%.*]] = alloca i32, align 4 // CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 -// CHECK1-NEXT: store ptr [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 4 -// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 128 -// CHECK1-NEXT: store i32 [[TMP1]], ptr [[T_VAR1]], align 128 +// CHECK1-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 // CHECK1-NEXT: ret void // // @@ -1123,27 +1123,23 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // // // CHECK3-LABEL: define {{[^@]+}}@.omp_outlined..2 -// CHECK3-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[G:%.*]], i32 [[SIVAR:%.*]]) #[[ATTR3]] { +// CHECK3-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i32 [[G:%.*]], i32 [[SIVAR:%.*]]) #[[ATTR3]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 -// CHECK3-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[G_ADDR:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[SIVAR_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[G1:%.*]] = alloca i32, align 128 // CHECK3-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_ANON_1:%.*]], align 4 // CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr 
[[DOTGLOBAL_TID__ADDR]], align 4 // CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[G]], ptr [[G_ADDR]], align 4 // CHECK3-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[G_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load volatile i32, ptr [[TMP0]], align 128 -// CHECK3-NEXT: store i32 [[TMP1]], ptr [[G1]], align 128 -// CHECK3-NEXT: store i32 1, ptr [[G1]], align 128 +// CHECK3-NEXT: store i32 1, ptr [[G_ADDR]], align 4 // CHECK3-NEXT: store i32 2, ptr [[SIVAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[REF_TMP]], i32 0, i32 0 -// CHECK3-NEXT: store ptr [[G1]], ptr [[TMP2]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[REF_TMP]], i32 0, i32 1 -// CHECK3-NEXT: store ptr [[SIVAR_ADDR]], ptr [[TMP3]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[REF_TMP]], i32 0, i32 0 +// CHECK3-NEXT: store ptr [[G_ADDR]], ptr [[TMP0]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[REF_TMP]], i32 0, i32 1 +// CHECK3-NEXT: store ptr [[SIVAR_ADDR]], ptr [[TMP1]], align 4 // CHECK3-NEXT: call void @"_ZZZ4mainENK3$_0clEvENKUlvE_clEv"(ptr nonnull align 4 dereferenceable(8) [[REF_TMP]]) // CHECK3-NEXT: ret void // @@ -1193,33 +1189,33 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK4-NEXT: entry: // CHECK4-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 4 // CHECK4-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr, align 4 +// CHECK4-NEXT: [[G_CASTED:%.*]] = alloca i32, align 4 // CHECK4-NEXT: [[SIVAR_CASTED:%.*]] = alloca i32, align 4 // CHECK4-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR]], align 4 // CHECK4-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[BLOCK_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load i32, ptr @_ZZ4mainE5sivar, align 4 -// CHECK4-NEXT: store i32 [[TMP0]], ptr [[SIVAR_CASTED]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load i32, ptr [[SIVAR_CASTED]], align 4 -// CHECK4-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1:[0-9]+]], i32 2, ptr @.omp_outlined., ptr @g, i32 [[TMP1]]) +// CHECK4-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @g, align 128 +// CHECK4-NEXT: store i32 [[TMP0]], ptr [[G_CASTED]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load i32, ptr [[G_CASTED]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, ptr @_ZZ4mainE5sivar, align 4 +// CHECK4-NEXT: store i32 [[TMP2]], ptr [[SIVAR_CASTED]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, ptr [[SIVAR_CASTED]], align 4 +// CHECK4-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1:[0-9]+]], i32 2, ptr @.omp_outlined., i32 [[TMP1]], i32 [[TMP3]]) // CHECK4-NEXT: ret void // // // CHECK4-LABEL: define {{[^@]+}}@.omp_outlined. 
-// CHECK4-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[G:%.*]], i32 [[SIVAR:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK4-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i32 [[G:%.*]], i32 [[SIVAR:%.*]]) #[[ATTR2:[0-9]+]] { // CHECK4-NEXT: entry: // CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 -// CHECK4-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 4 +// CHECK4-NEXT: [[G_ADDR:%.*]] = alloca i32, align 4 // CHECK4-NEXT: [[SIVAR_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[G1:%.*]] = alloca i32, align 128 // CHECK4-NEXT: [[BLOCK:%.*]] = alloca <{ ptr, i32, i32, ptr, ptr, i32, [104 x i8], i32 }>, align 128 // CHECK4-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK4-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[G]], ptr [[G_ADDR]], align 4 // CHECK4-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[G_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load volatile i32, ptr [[TMP0]], align 128 -// CHECK4-NEXT: store i32 [[TMP1]], ptr [[G1]], align 128 -// CHECK4-NEXT: store i32 1, ptr [[G1]], align 128 +// CHECK4-NEXT: store i32 1, ptr [[G_ADDR]], align 4 // CHECK4-NEXT: store i32 2, ptr [[SIVAR_ADDR]], align 4 // CHECK4-NEXT: [[BLOCK_ISA:%.*]] = getelementptr inbounds <{ ptr, i32, i32, ptr, ptr, i32, [104 x i8], i32 }>, ptr [[BLOCK]], i32 0, i32 0 // CHECK4-NEXT: store ptr @_NSConcreteStackBlock, ptr [[BLOCK_ISA]], align 128 @@ -1232,14 +1228,14 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK4-NEXT: [[BLOCK_DESCRIPTOR:%.*]] = getelementptr inbounds <{ ptr, i32, i32, ptr, ptr, i32, [104 x i8], i32 }>, ptr [[BLOCK]], i32 0, i32 4 // CHECK4-NEXT: store ptr @__block_descriptor_tmp, ptr [[BLOCK_DESCRIPTOR]], align 16 // CHECK4-NEXT: [[BLOCK_CAPTURED:%.*]] = getelementptr inbounds <{ ptr, i32, i32, ptr, ptr, i32, [104 x i8], i32 }>, ptr [[BLOCK]], i32 0, i32 7 -// CHECK4-NEXT: [[TMP2:%.*]] = load volatile i32, ptr [[G1]], align 128 -// CHECK4-NEXT: store volatile i32 [[TMP2]], ptr [[BLOCK_CAPTURED]], align 128 -// CHECK4-NEXT: [[BLOCK_CAPTURED2:%.*]] = getelementptr inbounds <{ ptr, i32, i32, ptr, ptr, i32, [104 x i8], i32 }>, ptr [[BLOCK]], i32 0, i32 5 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, ptr [[SIVAR_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP3]], ptr [[BLOCK_CAPTURED2]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT___BLOCK_LITERAL_GENERIC:%.*]], ptr [[BLOCK]], i32 0, i32 3 -// CHECK4-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 4 -// CHECK4-NEXT: call void [[TMP5]](ptr [[BLOCK]]) +// CHECK4-NEXT: [[TMP0:%.*]] = load volatile i32, ptr [[G_ADDR]], align 4 +// CHECK4-NEXT: store volatile i32 [[TMP0]], ptr [[BLOCK_CAPTURED]], align 128 +// CHECK4-NEXT: [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds <{ ptr, i32, i32, ptr, ptr, i32, [104 x i8], i32 }>, ptr [[BLOCK]], i32 0, i32 5 +// CHECK4-NEXT: [[TMP1:%.*]] = load i32, ptr [[SIVAR_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], ptr [[BLOCK_CAPTURED1]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___BLOCK_LITERAL_GENERIC:%.*]], ptr [[BLOCK]], i32 0, i32 3 +// CHECK4-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 4 +// CHECK4-NEXT: call void [[TMP3]](ptr [[BLOCK]]) // 
CHECK4-NEXT: ret void // // @@ -1675,6 +1671,8 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 128 // CHECK9-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S.0], align 128 // CHECK9-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S_0]], align 128 +// CHECK9-NEXT: [[T_VAR_CASTED:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[T_VAR_CASTED1:%.*]] = alloca i64, align 8 // CHECK9-NEXT: call void @_ZN1SIiEC1Ev(ptr nonnull align 4 dereferenceable(4) [[TEST]]) // CHECK9-NEXT: call void @_ZN3SSTIiEC1Ev(ptr nonnull align 4 dereferenceable(4) [[SST]]) // CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 128 @@ -1684,23 +1682,29 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1 // CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 2) // CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr nonnull align 4 dereferenceable(4) [[VAR]], i32 3) -// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 4, ptr @.omp_outlined..3, ptr [[VEC]], ptr [[T_VAR]], ptr [[S_ARR]], ptr [[VAR]]) -// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 1, ptr @.omp_outlined..4, ptr [[T_VAR]]) +// CHECK9-NEXT: [[TMP0:%.*]] = load i32, ptr [[T_VAR]], align 128 +// CHECK9-NEXT: store i32 [[TMP0]], ptr [[T_VAR_CASTED]], align 4 +// CHECK9-NEXT: [[TMP1:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 +// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 4, ptr @.omp_outlined..3, ptr [[VEC]], i64 [[TMP1]], ptr [[S_ARR]], ptr [[VAR]]) +// CHECK9-NEXT: [[TMP2:%.*]] = load i32, ptr [[T_VAR]], align 128 +// CHECK9-NEXT: store i32 [[TMP2]], ptr [[T_VAR_CASTED1]], align 4 +// CHECK9-NEXT: [[TMP3:%.*]] = load i64, ptr [[T_VAR_CASTED1]], align 8 +// CHECK9-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call(ptr @[[GLOB1]], i32 1, ptr @.omp_outlined..4, i64 [[TMP3]]) // CHECK9-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0 -// CHECK9-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i64 2 +// CHECK9-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i64 2 // CHECK9-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK9: arraydestroy.body: -// CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP0]], [[ENTRY:%.*]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] +// CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP4]], [[ENTRY:%.*]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 // CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] -// CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE1:%.*]], label [[ARRAYDESTROY_BODY]] -// CHECK9: arraydestroy.done1: +// CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE2:%.*]], label [[ARRAYDESTROY_BODY]] +// CHECK9: arraydestroy.done2: // CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] -// CHECK9-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4 -// CHECK9-NEXT: ret i32 [[TMP1]] +// CHECK9-NEXT: [[TMP5:%.*]] = load i32, ptr [[RETVAL]], align 4 +// CHECK9-NEXT: ret i32 [[TMP5]] // // // CHECK9-LABEL: define {{[^@]+}}@_ZN2SSC2ERi @@ -1921,67 +1925,63 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // // // CHECK9-LABEL: define {{[^@]+}}@.omp_outlined..3 -// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 [[T_VAR:%.*]], ptr nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR3]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[VEC_ADDR:%.*]] = alloca ptr, align 8 -// CHECK9-NEXT: [[T_VAR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[T_VAR_ADDR:%.*]] = alloca i64, align 8 // CHECK9-NEXT: [[S_ARR_ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[VAR_ADDR:%.*]] = alloca ptr, align 8 -// CHECK9-NEXT: [[T_VAR1:%.*]] = alloca i32, align 128 -// CHECK9-NEXT: [[VEC2:%.*]] = alloca [2 x i32], align 128 -// CHECK9-NEXT: [[S_ARR3:%.*]] = alloca [2 x %struct.S.0], align 128 +// CHECK9-NEXT: [[VEC1:%.*]] = alloca [2 x i32], align 128 +// CHECK9-NEXT: [[S_ARR2:%.*]] = alloca [2 x %struct.S.0], align 128 // CHECK9-NEXT: [[AGG_TMP:%.*]] = alloca [[STRUCT_ST:%.*]], align 4 -// CHECK9-NEXT: [[VAR5:%.*]] = alloca [[STRUCT_S_0:%.*]], align 128 -// CHECK9-NEXT: [[AGG_TMP6:%.*]] = alloca 
[[STRUCT_ST]], align 4 +// CHECK9-NEXT: [[VAR4:%.*]] = alloca [[STRUCT_S_0:%.*]], align 128 +// CHECK9-NEXT: [[AGG_TMP5:%.*]] = alloca [[STRUCT_ST]], align 4 // CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 // CHECK9-NEXT: store ptr [[VEC]], ptr [[VEC_ADDR]], align 8 -// CHECK9-NEXT: store ptr [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 +// CHECK9-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 // CHECK9-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK9-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 128 -// CHECK9-NEXT: store i32 [[TMP4]], ptr [[T_VAR1]], align 128 -// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 128 [[VEC2]], ptr align 128 [[TMP0]], i64 8, i1 false) -// CHECK9-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR3]], i32 0, i32 0 -// CHECK9-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i64 2 -// CHECK9-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP5]] -// CHECK9-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE4:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]] +// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 +// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 128 [[VEC1]], ptr align 128 [[TMP0]], i64 8, i1 false) +// CHECK9-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR2]], i32 0, i32 0 +// CHECK9-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i64 2 +// CHECK9-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP3]] +// CHECK9-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE3:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]] // CHECK9: omp.arraycpy.body: -// CHECK9-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP2]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] +// CHECK9-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP1]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] // CHECK9-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] // CHECK9-NEXT: call void @_ZN2StC1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP]]) // CHECK9-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr nonnull align 4 dereferenceable(4) [[OMP_ARRAYCPY_DESTELEMENTPAST]], ptr nonnull align 4 dereferenceable(4) [[OMP_ARRAYCPY_SRCELEMENTPAST]], ptr [[AGG_TMP]]) // CHECK9-NEXT: call void @_ZN2StD1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP]]) #[[ATTR4]] // CHECK9-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr [[STRUCT_S_0]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1 // CHECK9-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr [[STRUCT_S_0]], ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1 -// CHECK9-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]] -// CHECK9-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label 
[[OMP_ARRAYCPY_BODY]] -// CHECK9: omp.arraycpy.done4: -// CHECK9-NEXT: call void @_ZN2StC1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) -// CHECK9-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr nonnull align 4 dereferenceable(4) [[VAR5]], ptr nonnull align 4 dereferenceable(4) [[TMP3]], ptr [[AGG_TMP6]]) -// CHECK9-NEXT: call void @_ZN2StD1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) #[[ATTR4]] -// CHECK9-NEXT: [[TMP6:%.*]] = load i32, ptr [[T_VAR1]], align 128 -// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC2]], i64 0, i64 0 -// CHECK9-NEXT: store i32 [[TMP6]], ptr [[ARRAYIDX]], align 128 -// CHECK9-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR3]], i64 0, i64 0 -// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 128 [[ARRAYIDX7]], ptr align 128 [[VAR5]], i64 4, i1 false) -// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR4]] -// CHECK9-NEXT: [[ARRAY_BEGIN8:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR3]], i32 0, i32 0 -// CHECK9-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN8]], i64 2 +// CHECK9-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]] +// CHECK9-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE3]], label [[OMP_ARRAYCPY_BODY]] +// CHECK9: omp.arraycpy.done3: +// CHECK9-NEXT: call void @_ZN2StC1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP5]]) +// CHECK9-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr nonnull align 4 dereferenceable(4) [[VAR4]], ptr nonnull align 4 dereferenceable(4) [[TMP2]], ptr [[AGG_TMP5]]) +// CHECK9-NEXT: call void @_ZN2StD1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP5]]) #[[ATTR4]] +// CHECK9-NEXT: [[TMP4:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 +// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC1]], i64 0, i64 0 +// CHECK9-NEXT: store i32 [[TMP4]], ptr [[ARRAYIDX]], align 128 +// CHECK9-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR2]], i64 0, i64 0 +// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 128 [[ARRAYIDX6]], ptr align 128 [[VAR4]], i64 4, i1 false) +// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[VAR4]]) #[[ATTR4]] +// CHECK9-NEXT: [[ARRAY_BEGIN7:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR2]], i32 0, i32 0 +// CHECK9-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN7]], i64 2 // CHECK9-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK9: arraydestroy.body: -// CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[OMP_ARRAYCPY_DONE4]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] +// CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[OMP_ARRAYCPY_DONE3]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 // CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] -// CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN8]] -// CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE9:%.*]], label [[ARRAYDESTROY_BODY]] -// CHECK9: arraydestroy.done9: +// CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN7]] +// CHECK9-NEXT: br i1 
[[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE8:%.*]], label [[ARRAYDESTROY_BODY]] +// CHECK9: arraydestroy.done8: // CHECK9-NEXT: ret void // // @@ -2011,18 +2011,14 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // // // CHECK9-LABEL: define {{[^@]+}}@.omp_outlined..4 -// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[T_VAR:%.*]]) #[[ATTR3]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 -// CHECK9-NEXT: [[T_VAR_ADDR:%.*]] = alloca ptr, align 8 -// CHECK9-NEXT: [[T_VAR1:%.*]] = alloca i32, align 128 +// CHECK9-NEXT: [[T_VAR_ADDR:%.*]] = alloca i64, align 8 // CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK9-NEXT: store ptr [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 128 -// CHECK9-NEXT: store i32 [[TMP1]], ptr [[T_VAR1]], align 128 +// CHECK9-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 // CHECK9-NEXT: ret void // // @@ -2331,27 +2327,23 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // // // CHECK11-LABEL: define {{[^@]+}}@.omp_outlined..2 -// CHECK11-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[G:%.*]], i64 [[SIVAR:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[G:%.*]], i64 [[SIVAR:%.*]]) #[[ATTR3]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 -// CHECK11-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8 +// CHECK11-NEXT: [[G_ADDR:%.*]] = alloca i64, align 8 // CHECK11-NEXT: [[SIVAR_ADDR:%.*]] = alloca i64, align 8 -// CHECK11-NEXT: [[G1:%.*]] = alloca i32, align 128 // CHECK11-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_ANON_1:%.*]], align 8 // CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK11-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8 +// CHECK11-NEXT: store i64 [[G]], ptr [[G_ADDR]], align 8 // CHECK11-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[G_ADDR]], align 8 -// CHECK11-NEXT: [[TMP1:%.*]] = load volatile i32, ptr [[TMP0]], align 128 -// CHECK11-NEXT: store i32 [[TMP1]], ptr [[G1]], align 128 -// CHECK11-NEXT: store i32 1, ptr [[G1]], align 128 +// CHECK11-NEXT: store i32 1, ptr [[G_ADDR]], align 4 // CHECK11-NEXT: store i32 2, ptr [[SIVAR_ADDR]], align 4 -// CHECK11-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[REF_TMP]], i32 0, i32 0 -// CHECK11-NEXT: store ptr [[G1]], ptr [[TMP2]], align 8 -// CHECK11-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[REF_TMP]], i32 0, i32 1 -// CHECK11-NEXT: store ptr [[SIVAR_ADDR]], ptr [[TMP3]], align 8 +// CHECK11-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[REF_TMP]], i32 0, i32 0 +// CHECK11-NEXT: store ptr [[G_ADDR]], ptr [[TMP0]], align 8 +// 
CHECK11-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[REF_TMP]], i32 0, i32 1 +// CHECK11-NEXT: store ptr [[SIVAR_ADDR]], ptr [[TMP1]], align 8 // CHECK11-NEXT: call void @"_ZZZ4mainENK3$_0clEvENKUlvE_clEv"(ptr nonnull align 8 dereferenceable(16) [[REF_TMP]]) // CHECK11-NEXT: ret void // @@ -2401,33 +2393,33 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK12-NEXT: entry: // CHECK12-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8 // CHECK12-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr, align 8 +// CHECK12-NEXT: [[G_CASTED:%.*]] = alloca i64, align 8 // CHECK12-NEXT: [[SIVAR_CASTED:%.*]] = alloca i64, align 8 // CHECK12-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8 // CHECK12-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[BLOCK_ADDR]], align 8 -// CHECK12-NEXT: [[TMP0:%.*]] = load i32, ptr @_ZZ4mainE5sivar, align 4 -// CHECK12-NEXT: store i32 [[TMP0]], ptr [[SIVAR_CASTED]], align 4 -// CHECK12-NEXT: [[TMP1:%.*]] = load i64, ptr [[SIVAR_CASTED]], align 8 -// CHECK12-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1:[0-9]+]], i32 2, ptr @.omp_outlined., ptr @g, i64 [[TMP1]]) +// CHECK12-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @g, align 128 +// CHECK12-NEXT: store i32 [[TMP0]], ptr [[G_CASTED]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = load i64, ptr [[G_CASTED]], align 8 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, ptr @_ZZ4mainE5sivar, align 4 +// CHECK12-NEXT: store i32 [[TMP2]], ptr [[SIVAR_CASTED]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load i64, ptr [[SIVAR_CASTED]], align 8 +// CHECK12-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1:[0-9]+]], i32 2, ptr @.omp_outlined., i64 [[TMP1]], i64 [[TMP3]]) // CHECK12-NEXT: ret void // // // CHECK12-LABEL: define {{[^@]+}}@.omp_outlined. 
-// CHECK12-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[G:%.*]], i64 [[SIVAR:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK12-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[G:%.*]], i64 [[SIVAR:%.*]]) #[[ATTR2:[0-9]+]] { // CHECK12-NEXT: entry: // CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 -// CHECK12-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8 +// CHECK12-NEXT: [[G_ADDR:%.*]] = alloca i64, align 8 // CHECK12-NEXT: [[SIVAR_ADDR:%.*]] = alloca i64, align 8 -// CHECK12-NEXT: [[G1:%.*]] = alloca i32, align 128 // CHECK12-NEXT: [[BLOCK:%.*]] = alloca <{ ptr, i32, i32, ptr, ptr, i32, [92 x i8], i32 }>, align 128 // CHECK12-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK12-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK12-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8 +// CHECK12-NEXT: store i64 [[G]], ptr [[G_ADDR]], align 8 // CHECK12-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[G_ADDR]], align 8 -// CHECK12-NEXT: [[TMP1:%.*]] = load volatile i32, ptr [[TMP0]], align 128 -// CHECK12-NEXT: store i32 [[TMP1]], ptr [[G1]], align 128 -// CHECK12-NEXT: store i32 1, ptr [[G1]], align 128 +// CHECK12-NEXT: store i32 1, ptr [[G_ADDR]], align 4 // CHECK12-NEXT: store i32 2, ptr [[SIVAR_ADDR]], align 4 // CHECK12-NEXT: [[BLOCK_ISA:%.*]] = getelementptr inbounds <{ ptr, i32, i32, ptr, ptr, i32, [92 x i8], i32 }>, ptr [[BLOCK]], i32 0, i32 0 // CHECK12-NEXT: store ptr @_NSConcreteStackBlock, ptr [[BLOCK_ISA]], align 128 @@ -2440,14 +2432,14 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK12-NEXT: [[BLOCK_DESCRIPTOR:%.*]] = getelementptr inbounds <{ ptr, i32, i32, ptr, ptr, i32, [92 x i8], i32 }>, ptr [[BLOCK]], i32 0, i32 4 // CHECK12-NEXT: store ptr @__block_descriptor_tmp, ptr [[BLOCK_DESCRIPTOR]], align 8 // CHECK12-NEXT: [[BLOCK_CAPTURED:%.*]] = getelementptr inbounds <{ ptr, i32, i32, ptr, ptr, i32, [92 x i8], i32 }>, ptr [[BLOCK]], i32 0, i32 7 -// CHECK12-NEXT: [[TMP2:%.*]] = load volatile i32, ptr [[G1]], align 128 -// CHECK12-NEXT: store volatile i32 [[TMP2]], ptr [[BLOCK_CAPTURED]], align 128 -// CHECK12-NEXT: [[BLOCK_CAPTURED2:%.*]] = getelementptr inbounds <{ ptr, i32, i32, ptr, ptr, i32, [92 x i8], i32 }>, ptr [[BLOCK]], i32 0, i32 5 -// CHECK12-NEXT: [[TMP3:%.*]] = load i32, ptr [[SIVAR_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP3]], ptr [[BLOCK_CAPTURED2]], align 32 -// CHECK12-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT___BLOCK_LITERAL_GENERIC:%.*]], ptr [[BLOCK]], i32 0, i32 3 -// CHECK12-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 -// CHECK12-NEXT: call void [[TMP5]](ptr [[BLOCK]]) +// CHECK12-NEXT: [[TMP0:%.*]] = load volatile i32, ptr [[G_ADDR]], align 4 +// CHECK12-NEXT: store volatile i32 [[TMP0]], ptr [[BLOCK_CAPTURED]], align 128 +// CHECK12-NEXT: [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds <{ ptr, i32, i32, ptr, ptr, i32, [92 x i8], i32 }>, ptr [[BLOCK]], i32 0, i32 5 +// CHECK12-NEXT: [[TMP1:%.*]] = load i32, ptr [[SIVAR_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP1]], ptr [[BLOCK_CAPTURED1]], align 32 +// CHECK12-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___BLOCK_LITERAL_GENERIC:%.*]], ptr [[BLOCK]], i32 0, i32 3 +// CHECK12-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8 +// CHECK12-NEXT: 
call void [[TMP3]](ptr [[BLOCK]]) // CHECK12-NEXT: ret void // // diff --git a/clang/test/OpenMP/parallel_master_taskloop_firstprivate_codegen.cpp b/clang/test/OpenMP/parallel_master_taskloop_firstprivate_codegen.cpp index a120b8bfb1904..d316ee8b3e411 100644 --- a/clang/test/OpenMP/parallel_master_taskloop_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/parallel_master_taskloop_firstprivate_codegen.cpp @@ -1,3 +1,4 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ // RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s // RUN: %clang_cc1 -no-opaque-pointers -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s // RUN: %clang_cc1 -no-opaque-pointers -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s @@ -7,11 +8,10 @@ // RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s // RUN: %clang_cc1 -no-opaque-pointers -fopenmp-simd -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s -// RUN: %clang_cc1 -no-opaque-pointers -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp-simd -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp-simd -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp-simd -x c++ -std=c++11 -DARRAY -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// SIMD-ONLY0-NOT: {{__kmpc|__tgt}} +// RUN: %clang_cc1 -no-opaque-pointers -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp-simd -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY2 %s +// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp-simd -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY3 %s +// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp-simd -x c++ -std=c++11 -DARRAY -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY4 %s // expected-no-diagnostics #ifndef ARRAY @@ -30,15 +30,6 @@ struct S { volatile double g; -// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}}, i64, i64, i64, i32, i8* } -// CHECK-DAG: [[S_DOUBLE_TY:%.+]] = type { double } -// CHECK-DAG: [[PRIVATES_MAIN_TY:%.+]] = type {{.?}}{ [2 x [[S_DOUBLE_TY]]], [[S_DOUBLE_TY]], i32, [2 x i32] -// CHECK-DAG: [[CAP_MAIN_TY:%.+]] = type { [2 x [[S_DOUBLE_TY]]]*, [[S_DOUBLE_TY]]* } -// CHECK-DAG: [[KMP_TASK_MAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [[PRIVATES_MAIN_TY]] } -// CHECK-DAG: [[S_INT_TY:%.+]] = type { i32 } -// CHECK-DAG: [[CAP_TMAIN_TY:%.+]] = type { [2 x [[S_INT_TY]]]*, [[S_INT_TY]]* } -// 
CHECK-DAG: [[PRIVATES_TMAIN_TY:%.+]] = type { i32, [2 x i32], [2 x [[S_INT_TY]]], [[S_INT_TY]], [104 x i8] } -// CHECK-DAG: [[KMP_TASK_TMAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [{{[0-9]+}} x i8], [[PRIVATES_TMAIN_TY]] } template T tmain() { S ttt; @@ -58,48 +49,14 @@ T tmain() { int main() { static int sivar; #ifdef LAMBDA - // LAMBDA: [[G:@.+]] ={{.*}} global double - // LAMBDA: [[SIVAR:@.+]] = internal global i{{[0-9]+}} 0, - // LAMBDA-LABEL: @main - // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]]( [&]() { - // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]]( -// LAMBDA: [[RES:%.+]] = call {{.*}}i32 @__kmpc_master( -// LAMBDA-NEXT: [[IS_MASTER:%.+]] = icmp ne i32 [[RES]], 0 -// LAMBDA-NEXT: br i1 [[IS_MASTER]], label {{%?}}[[THEN:.+]], label {{%?}}[[EXIT:.+]] -// LAMBDA: [[THEN]] -// LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 96, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*)) -// LAMBDA: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1 -// LAMBDA: [[G_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0 -// LAMBDA: [[G_VAL:%.+]] = load volatile double, double* @{{.+}}, -// LAMBDA: store volatile double [[G_VAL]], double* [[G_PRIVATE_ADDR]] - -// LAMBDA: [[SIVAR_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1 -// LAMBDA: [[SIVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* @{{.+}}, -// LAMBDA: store i{{[0-9]+}} [[SIVAR_VAL]], i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]] - -// LAMBDA: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 1, i32 0, i64 0, i8* null) -// LAMBDA: call {{.*}}void @__kmpc_end_master( -// LAMBDA-NEXT: br label {{%?}}[[EXIT]] -// LAMBDA: [[EXIT]] -// LAMBDA: ret + + #pragma omp parallel master taskloop firstprivate(g, sivar) for (int i = 0; i < 10; ++i) { - // LAMBDA: define {{.+}} void [[INNER_LAMBDA:@.+]](%{{.+}}* {{[^,]*}} [[ARG_PTR:%.+]]) - // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]], - // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]] - // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 - // LAMBDA: [[G_REF:%.+]] = load double*, double** [[G_PTR_REF]] - // LAMBDA: store double 2.0{{.+}}, double* [[G_REF]] - - // LAMBDA: store double* %{{.+}}, double** %{{.+}}, - // LAMBDA: define internal noundef i32 [[TASK_ENTRY]](i32 noundef %0, %{{.+}}* noalias noundef %1) + g = 1; sivar = 11; - // LAMBDA: store double 1.0{{.+}}, double* %{{.+}}, - // LAMBDA: store i{{[0-9]+}} 11, i{{[0-9]+}}* %{{.+}}, - // LAMBDA: call void [[INNER_LAMBDA]](% - // LAMBDA: ret [&]() { g = 2; sivar = 22; @@ -108,51 +65,13 @@ int main() { }(); return 0; #elif defined(BLOCKS) - // BLOCKS: [[G:@.+]] ={{.*}} global double - // BLOCKS: [[SIVAR:@.+]] = internal global i{{[0-9]+}} 0, - // BLOCKS-LABEL: @main - // BLOCKS: call void {{%.+}}(i8 ^{ - // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8* - // BLOCKS: [[RES:%.+]] = call {{.*}}i32 @__kmpc_master( - // BLOCKS-NEXT: [[IS_MASTER:%.+]] = icmp ne i32 [[RES]], 0 - // BLOCKS-NEXT: br i1 [[IS_MASTER]], label {{%?}}[[THEN:.+]], label {{%?}}[[EXIT:.+]] - // BLOCKS: [[THEN]] - // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 96, i64 1, i32 (i32, i8*)* bitcast 
(i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*)) - // BLOCKS: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1 - // BLOCKS: [[G_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0 - // BLOCKS: [[G_VAL:%.+]] = load volatile double, double* @{{.+}}, - // BLOCKS: store volatile double [[G_VAL]], double* [[G_PRIVATE_ADDR]] - - // BLOCKS: [[SIVAR_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1 - // BLOCKS: [[SIVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* @{{.+}}, - // BLOCKS: store i{{[0-9]+}} [[SIVAR_VAL]], i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]] - // BLOCKS: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 1, i32 0, i64 0, i8* null) - // BLOCKS: call {{.*}}void @__kmpc_end_master( - // BLOCKS-NEXT: br label {{%?}}[[EXIT]] - // BLOCKS: [[EXIT]] - // BLOCKS: ret + #pragma omp parallel master taskloop firstprivate(g, sivar) for (int i = 0; i < 10; ++i) { - // BLOCKS: define {{.+}} void {{@.+}}(i8* - // BLOCKS-NOT: [[G]]{{[[^:word:]]}} - // BLOCKS: store double 2.0{{.+}}, double* - // BLOCKS-NOT: [[G]]{{[[^:word:]]}} - // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}} - // BLOCKS: store i{{[0-9]+}} 22, i{{[0-9]+}}* - // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}} - // BLOCKS: ret - - // BLOCKS: store double* %{{.+}}, double** %{{.+}}, - // BLOCKS: store i{{[0-9]+}}* %{{.+}}, i{{[0-9]+}}** %{{.+}}, - // BLOCKS: define internal noundef i32 [[TASK_ENTRY]](i32 noundef %0, %{{.+}}* noalias noundef %1) + g = 1; sivar = 11; - // BLOCKS: store double 1.0{{.+}}, double* %{{.+}}, - // BLOCKS-NOT: [[G]]{{[[^:word:]]}} - // BLOCKS: store i{{[0-9]+}} 11, i{{[0-9]+}}* %{{.+}}, - // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}} - // BLOCKS: call void {{%.+}}(i8 ^{ g = 2; sivar = 22; @@ -177,303 +96,86 @@ int main() { #endif } -// CHECK: [[SIVAR:.+]] = internal global i{{[0-9]+}} 0, -// CHECK: define{{.*}} i{{[0-9]+}} @main() -// CHECK: alloca [[S_DOUBLE_TY]], -// CHECK: [[TEST:%.+]] = alloca [[S_DOUBLE_TY]], -// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32, -// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32], -// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]], -// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]], - -// CHECK: call {{.*}} [[S_DOUBLE_TY_COPY_CONSTR:@.+]]([[S_DOUBLE_TY]]* {{[^,]*}} [[TEST]], - -// CHECK: [[RES:%.+]] = call {{.*}}i32 @__kmpc_master( -// CHECK-NEXT: [[IS_MASTER:%.+]] = icmp ne i32 [[RES]], 0 -// CHECK-NEXT: br i1 [[IS_MASTER]], label {{%?}}[[THEN:.+]], label {{%?}}[[EXIT:.+]] -// CHECK: [[THEN]] + + // Store original variables in capture struct. -// CHECK: [[S_ARR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CHECK: store [2 x [[S_DOUBLE_TY]]]* %{{.+}}, [2 x [[S_DOUBLE_TY]]]** [[S_ARR_REF]], -// CHECK: [[VAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1 -// CHECK: store [[S_DOUBLE_TY]]* %{{.+}}, [[S_DOUBLE_TY]]** [[VAR_REF]], // Allocate task. 
// Returns struct kmp_task_t { // [[KMP_TASK_T]] task_data; // [[KMP_TASK_MAIN_TY]] privates; // }; -// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @{{.+}}, i32 %{{.+}}, i32 9, i64 120, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*)) -// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_MAIN_TY]]* // Fill kmp_task_t->shareds by copying from original capture argument. -// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]], -// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_MAIN_TY]]* %{{.+}} to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[SHAREDS_REF]], i8* align 8 [[CAPTURES_ADDR]], i64 16, i1 false) // Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes). // Also copy address of private copy to the corresponding shareds reference. -// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 // Constructors for s_arr and var. // s_arr; -// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CHECK: bitcast [2 x [[S_DOUBLE_TY]]]* %{{.+}} to [[S_DOUBLE_TY]]* -// CHECK: call void [[S_DOUBLE_TY_COPY_CONSTR]]([[S_DOUBLE_TY]]* {{[^,]*}} [[S_ARR_CUR:%[^,]+]], -// CHECK: getelementptr [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* [[S_ARR_CUR]], i{{.+}} 1 -// CHECK: getelementptr [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} 1 -// CHECK: icmp eq -// CHECK: br i1 // var; -// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1 -// CHECK-NEXT: call void [[S_DOUBLE_TY_COPY_CONSTR]]([[S_DOUBLE_TY]]* {{[^,]*}} [[PRIVATE_VAR_REF]], [[S_DOUBLE_TY]]* {{.*}}, // t_var; -// CHECK: [[PRIVATE_T_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2 -// CHECK-NEXT: [[T_VAR:%.+]] = load i32, i32* %{{.+}}, -// CHECK-NEXT: store i32 [[T_VAR]], i32* [[PRIVATE_T_VAR_REF]], // vec; -// CHECK: [[PRIVATE_VEC_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3 -// CHECK-NEXT: bitcast [2 x i32]* [[PRIVATE_VEC_REF]] to i8* -// CHECK-NEXT: bitcast [2 x i32]* %{{.+}} to i8* -// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64( // sivar; -// CHECK: [[PRIVATE_SIVAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 4 -// CHECK-NEXT: [[SIVAR:%.+]] = load i32, i32* @{{.+}}, -// CHECK-NEXT: store i32 [[SIVAR]], i32* [[PRIVATE_SIVAR_REF]], // Provide pointer to destructor function, which will destroy private variables at the end of the task. -// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3 -// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)** -// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]], // Start task. 
-// CHECK: call void @__kmpc_taskloop(%struct.ident_t* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 1, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_MAIN_TY]]*, [[KMP_TASK_MAIN_TY]]*, i32)* [[MAIN_DUP:@.+]] to i8*)) -// CHECK: call {{.*}}void @__kmpc_end_master( -// CHECK-NEXT: br label {{%?}}[[EXIT]] -// CHECK: [[EXIT]] - -// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_MAIN_TY]]* noalias noundef %0, [[S_DOUBLE_TY]]** noalias noundef %1, i32** noalias noundef %2, [2 x [[S_DOUBLE_TY]]]** noalias noundef %3, [2 x i32]** noalias noundef %4, i32** noalias noundef %5) -// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_MAIN_TY]]*, [[PRIVATES_MAIN_TY]]** -// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 0 -// CHECK: [[ARG3:%.+]] = load [2 x [[S_DOUBLE_TY]]]**, [2 x [[S_DOUBLE_TY]]]*** %{{.+}}, -// CHECK: store [2 x [[S_DOUBLE_TY]]]* [[PRIV_S_VAR]], [2 x [[S_DOUBLE_TY]]]** [[ARG3]], -// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 1 -// CHECK: [[ARG1:%.+]] = load [[S_DOUBLE_TY]]**, [[S_DOUBLE_TY]]*** {{.+}}, -// CHECK: store [[S_DOUBLE_TY]]* [[PRIV_VAR]], [[S_DOUBLE_TY]]** [[ARG1]], -// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 2 -// CHECK: [[ARG2:%.+]] = load i32**, i32*** %{{.+}}, -// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG2]], -// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 3 -// CHECK: [[ARG4:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}}, -// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG4]], -// CHECK: [[PRIV_SIVAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 4 -// CHECK: [[ARG5:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** %{{.+}}, -// CHECK: store i{{[0-9]+}}* [[PRIV_SIVAR]], i{{[0-9]+}}** [[ARG5]], -// CHECK: ret void - -// CHECK: define internal noundef i32 [[TASK_ENTRY]](i32 noundef %0, [[KMP_TASK_MAIN_TY]]* noalias noundef %1) - -// CHECK: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]]*, -// CHECK: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*, -// CHECK: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]]*, -// CHECK: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*, -// CHECK: [[PRIV_SIVAR_ADDR:%.+]] = alloca i32*, -// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_MAIN_TY]]*, [[S_DOUBLE_TY]]**, i32**, [2 x [[S_DOUBLE_TY]]]**, [2 x i32]**, i32**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]], -// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]], - -// CHECK: [[FN:%.+]] = bitcast void (i8*, ...)* [[MAP_FN]] to void (i8*, -// CHECK: call void [[FN]](i8* %{{.+}}, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]], i32** [[PRIV_T_VAR_ADDR]], [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], i32** [[PRIV_SIVAR_ADDR]]) - -// CHECK: [[PRIV_VAR:%.+]] = load [[S_DOUBLE_TY]]*, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]], -// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]], -// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_DOUBLE_TY]]]*, [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]], -// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]], -// CHECK: [[PRIV_SIVAR:%.+]] = load i32*, i32** [[PRIV_SIVAR_ADDR]], + + + + + // Privates actually are used. 
-// CHECK-DAG: [[PRIV_VAR]] -// CHECK-DAG: [[PRIV_T_VAR]] -// CHECK-DAG: [[PRIV_S_ARR]] -// CHECK-DAG: [[PRIV_VEC]] -// CHECK-DAG: [[PRIV_SIVAR]] - -// CHECK: ret - -// CHECK: define internal void [[MAIN_DUP]]([[KMP_TASK_MAIN_TY]]* noundef %0, [[KMP_TASK_MAIN_TY]]* noundef %1, i32 noundef %2) -// CHECK: getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* %{{.+}}, i32 0, i32 1 -// CHECK: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* %{{.+}}, i32 0, i32 0 -// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* %{{.+}}, i32 0, i32 0 -// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i64 2 -// CHECK: br i1 % - -// CHECK: phi [[S_DOUBLE_TY]]* -// CHECK: call {{.*}} [[S_DOUBLE_TY_COPY_CONSTR]]([[S_DOUBLE_TY]]* -// CHECK: getelementptr [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i32 1 -// CHECK: icmp eq [[S_DOUBLE_TY]]* % -// CHECK: br i1 % - -// CHECK: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* %{{.+}}, i32 0, i32 1 -// CHECK: call {{.*}} [[S_DOUBLE_TY_COPY_CONSTR]]([[S_DOUBLE_TY]]* -// CHECK: ret void - -// CHECK: define internal noundef i32 [[DESTRUCTORS]](i32 noundef %{{.+}}, [[KMP_TASK_MAIN_TY]]* noalias noundef %{{.+}}) -// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 -// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0 -// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1 -// CHECK: call void @_ZN1SIdED1Ev([[S_DOUBLE_TY]]* {{[^,]*}} [[PRIVATE_VAR_REF]]) -// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0 -// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} 2 -// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} -1 -// CHECK: call void @_ZN1SIdED1Ev([[S_DOUBLE_TY]]* {{[^,]*}} [[PRIVATE_S_ARR_ELEM_REF]]) -// CHECK: icmp eq -// CHECK: br i1 -// CHECK: ret i32 - -// CHECK: alloca [[S_INT_TY]], -// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]], -// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32, align 128 -// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32], -// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]], -// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]], - -// CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR:@.+]]([[S_INT_TY]]* {{[^,]*}} [[TEST]], + + + + + + + // Store original variables in capture struct. -// CHECK: [[S_ARR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CHECK: store [2 x [[S_INT_TY]]]* %{{.+}}, [2 x [[S_INT_TY]]]** [[S_ARR_REF]], -// CHECK: [[VAR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1 -// CHECK: store [[S_INT_TY]]* %{{.+}}, [[S_INT_TY]]** [[VAR_REF]], // Allocate task. 
// Returns struct kmp_task_t { // [[KMP_TASK_T_TY]] task_data; // [[KMP_TASK_TMAIN_TY]] privates; // }; -// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @{{.+}}, i32 %{{.+}}, i32 9, i64 256, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*)) -// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_TMAIN_TY]]* // Fill kmp_task_t->shareds by copying from original capture argument. -// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]], -// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_TMAIN_TY]]* %{{.+}} to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[SHAREDS_REF]], i8* align 8 [[CAPTURES_ADDR]], i64 16, i1 false) // Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes). -// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 2 // t_var; -// CHECK: [[PRIVATE_T_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0 -// CHECK: [[T_VAR:%.+]] = load i32, i32* %{{.+}}, align 128 -// CHECK: store i32 [[T_VAR]], i32* [[PRIVATE_T_VAR_REF]], align 128 // vec; -// CHECK: [[PRIVATE_VEC_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1 -// CHECK-NEXT: bitcast [2 x i32]* [[PRIVATE_VEC_REF]] to i8* -// CHECK-NEXT: bitcast [2 x i32]* %{{.+}} to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64( // Constructors for s_arr and var. // a_arr; -// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 2 -// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0 -// CHECK: bitcast [2 x [[S_INT_TY]]]* %{{.+}} to [[S_INT_TY]]* -// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2 -// CHECK: call void [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]* {{[^,]*}} [[S_ARR_CUR:%[^,]+]], -// CHECK: getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_CUR]], i{{.+}} 1 -// CHECK: icmp eq -// CHECK: br i1 // var; -// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3 -// CHECK-NEXT: call void [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]* {{[^,]*}} [[PRIVATE_VAR_REF]], // Provide pointer to destructor function, which will destroy private variables at the end of the task. -// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3 -// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)** -// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]], // Start task. 
-// CHECK: call void @__kmpc_taskloop(%struct.ident_t* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 1, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_TMAIN_TY]]*, [[KMP_TASK_TMAIN_TY]]*, i32)* [[TMAIN_DUP:@.+]] to i8*)) - -// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_TMAIN_TY]]* noalias noundef %{{.+}}, i32** noalias noundef %{{.+}}, [2 x i32]** noalias noundef %{{.+}}, [2 x [[S_INT_TY]]]** noalias noundef %{{.+}}, [[S_INT_TY]]** noalias noundef %{{.+}}) -// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_TMAIN_TY]]*, [[PRIVATES_TMAIN_TY]]** -// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 0 -// CHECK: [[ARG1:%.+]] = load i32**, i32*** %{{.+}}, -// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG1]], -// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 1 -// CHECK: [[ARG2:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}}, -// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG2]], -// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 2 -// CHECK: [[ARG3:%.+]] = load [2 x [[S_INT_TY]]]**, [2 x [[S_INT_TY]]]*** %{{.+}}, -// CHECK: store [2 x [[S_INT_TY]]]* [[PRIV_S_VAR]], [2 x [[S_INT_TY]]]** [[ARG3]], -// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 3 -// CHECK: [[ARG4:%.+]] = load [[S_INT_TY]]**, [[S_INT_TY]]*** {{.+}}, -// CHECK: store [[S_INT_TY]]* [[PRIV_VAR]], [[S_INT_TY]]** [[ARG4]], -// CHECK: ret void - -// CHECK: define internal noundef i32 [[TASK_ENTRY]](i32 noundef %0, [[KMP_TASK_TMAIN_TY]]* noalias noundef %1) -// CHECK: alloca i32*, -// CHECK-DAG: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*, -// CHECK-DAG: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*, -// CHECK-DAG: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*, -// CHECK-DAG: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*, -// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_TMAIN_TY]]*, i32**, [2 x i32]**, [2 x [[S_INT_TY]]]**, [[S_INT_TY]]**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]], -// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]], -// CHECK: [[FN:%.+]] = bitcast void (i8*, ...)* [[MAP_FN]] to void (i8*, -// CHECK: call void [[FN]](i8* %{{.+}}, i32** [[PRIV_T_VAR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]], [[S_INT_TY]]** [[PRIV_VAR_ADDR]]) -// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]], -// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]], -// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]], -// CHECK: [[PRIV_VAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[PRIV_VAR_ADDR]], + + // Privates actually are used. 
-// CHECK-DAG: [[PRIV_VAR]] -// CHECK-DAG: [[PRIV_T_VAR]] -// CHECK-DAG: [[PRIV_S_ARR]] -// CHECK-DAG: [[PRIV_VEC]] - -// CHECK: ret - -// CHECK: define internal void [[TMAIN_DUP]]([[KMP_TASK_TMAIN_TY]]* noundef %0, [[KMP_TASK_TMAIN_TY]]* noundef %1, i32 noundef %2) -// CHECK: getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* %{{.+}}, i32 0, i32 2 -// CHECK: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* %{{.+}}, i32 0, i32 2 -// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* %{{.+}}, i32 0, i32 0 -// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i64 2 -// CHECK: br i1 % - -// CHECK: phi [[S_INT_TY]]* -// CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]* -// CHECK: getelementptr [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i32 1 -// CHECK: icmp eq [[S_INT_TY]]* % -// CHECK: br i1 % - -// CHECK: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* %{{.+}}, i32 0, i32 3 -// CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]* -// CHECK: ret void - -// CHECK: define internal noundef i32 [[DESTRUCTORS]](i32 noundef %0, [[KMP_TASK_TMAIN_TY]]* noalias noundef %1) -// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 2 -// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2 -// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3 -// CHECK: call void @_ZN1SIiED1Ev([[S_INT_TY]]* {{[^,]*}} [[PRIVATE_VAR_REF]]) -// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0 -// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2 -// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} -1 -// CHECK: call void @_ZN1SIiED1Ev([[S_INT_TY]]* {{[^,]*}} [[PRIVATE_S_ARR_ELEM_REF]]) -// CHECK: icmp eq -// CHECK: br i1 -// CHECK: ret i32 + + + + + #endif #else -// ARRAY-LABEL: array_func struct St { int a, b; St() : a(0), b(0) {} @@ -482,13 +184,2207 @@ struct St { }; void array_func(int n, float a[n], St s[2]) { -// ARRAY: call i8* @__kmpc_omp_task_alloc( -// ARRAY: call void @__kmpc_taskloop( -// ARRAY: store float** %{{.+}}, float*** %{{.+}}, -// ARRAY: store %struct.St** %{{.+}}, %struct.St*** %{{.+}}, #pragma omp parallel master taskloop firstprivate(a, s) for (int i = 0; i < 10; ++i) ; } #endif +// CHECK-LABEL: define {{[^@]+}}@main +// CHECK-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[TTT:%.*]] = alloca [[STRUCT_S:%.*]], align 8 +// CHECK-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S]], align 8 +// CHECK-NEXT: [[T_VAR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4 +// CHECK-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S], align 16 +// CHECK-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S]], align 8 +// CHECK-NEXT: [[T_VAR_CASTED:%.*]] = alloca i64, align 8 +// CHECK-NEXT: store i32 0, i32* [[RETVAL]], align 4 +// CHECK-NEXT: call void @_ZN1SIdEC1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[TTT]]) +// CHECK-NEXT: call void @_ZN1SIdEC1ERKS0_d(%struct.S* noundef nonnull align 8 dereferenceable(8) [[TEST]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[TTT]], double noundef 0.000000e+00) +// 
CHECK-NEXT: store i32 0, i32* [[T_VAR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [2 x i32]* [[VEC]] to i8* +// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP0]], i8* align 4 bitcast ([2 x i32]* @__const.main.vec to i8*), i64 8, i1 false) +// CHECK-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[S_ARR]], i64 0, i64 0 +// CHECK-NEXT: call void @_ZN1SIdEC1Ed(%struct.S* noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_BEGIN]], double noundef 1.000000e+00) +// CHECK-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[ARRAYINIT_BEGIN]], i64 1 +// CHECK-NEXT: call void @_ZN1SIdEC1Ed(%struct.S* noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_ELEMENT]], double noundef 2.000000e+00) +// CHECK-NEXT: call void @_ZN1SIdEC1Ed(%struct.S* noundef nonnull align 8 dereferenceable(8) [[VAR]], double noundef 3.000000e+00) +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[T_VAR]], align 4 +// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[T_VAR_CASTED]] to i32* +// CHECK-NEXT: store i32 [[TMP1]], i32* [[CONV]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i64, i64* [[T_VAR_CASTED]], align 8 +// CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, [2 x i32]*, i64, [2 x %struct.S]*, %struct.S*)* @.omp_outlined. to void (i32*, i32*, ...)*), [2 x i32]* [[VEC]], i64 [[TMP2]], [2 x %struct.S]* [[S_ARR]], %struct.S* [[VAR]]) +// CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @_Z5tmainIiET_v() +// CHECK-NEXT: store i32 [[CALL]], i32* [[RETVAL]], align 4 +// CHECK-NEXT: call void @_ZN1SIdED1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[VAR]]) #[[ATTR4:[0-9]+]] +// CHECK-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[S_ARR]], i32 0, i32 0 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[ARRAY_BEGIN]], i64 2 +// CHECK-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] +// CHECK: arraydestroy.body: +// CHECK-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi %struct.S* [ [[TMP3]], [[ENTRY:%.*]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] +// CHECK-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 +// CHECK-NEXT: call void @_ZN1SIdED1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq %struct.S* [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] +// CHECK-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE1:%.*]], label [[ARRAYDESTROY_BODY]] +// CHECK: arraydestroy.done1: +// CHECK-NEXT: call void @_ZN1SIdED1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[TEST]]) #[[ATTR4]] +// CHECK-NEXT: call void @_ZN1SIdED1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[TTT]]) #[[ATTR4]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[RETVAL]], align 4 +// CHECK-NEXT: ret i32 [[TMP4]] +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIdEC1Ev +// CHECK-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// CHECK-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: call void 
@_ZN1SIdEC2Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS1]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIdEC1ERKS0_d +// CHECK-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[S:%.*]], double noundef [[T:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// CHECK-NEXT: [[S_ADDR:%.*]] = alloca %struct.S*, align 8 +// CHECK-NEXT: [[T_ADDR:%.*]] = alloca double, align 8 +// CHECK-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: store %struct.S* [[S]], %struct.S** [[S_ADDR]], align 8 +// CHECK-NEXT: store double [[T]], double* [[T_ADDR]], align 8 +// CHECK-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load %struct.S*, %struct.S** [[S_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[T_ADDR]], align 8 +// CHECK-NEXT: call void @_ZN1SIdEC2ERKS0_d(%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS1]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[TMP0]], double noundef [[TMP1]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIdEC1Ed +// CHECK-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], double noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca double, align 8 +// CHECK-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: store double [[A]], double* [[A_ADDR]], align 8 +// CHECK-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[A_ADDR]], align 8 +// CHECK-NEXT: call void @_ZN1SIdEC2Ed(%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS1]], double noundef [[TMP0]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@.omp_outlined. 
+// CHECK-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], [2 x i32]* noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 noundef [[T_VAR:%.*]], [2 x %struct.S]* noundef nonnull align 8 dereferenceable(16) [[S_ARR:%.*]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[VAR:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[VEC_ADDR:%.*]] = alloca [2 x i32]*, align 8 +// CHECK-NEXT: [[T_VAR_ADDR:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[S_ARR_ADDR:%.*]] = alloca [2 x %struct.S]*, align 8 +// CHECK-NEXT: [[VAR_ADDR:%.*]] = alloca %struct.S*, align 8 +// CHECK-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 8 +// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK-NEXT: store [2 x i32]* [[VEC]], [2 x i32]** [[VEC_ADDR]], align 8 +// CHECK-NEXT: store i64 [[T_VAR]], i64* [[T_VAR_ADDR]], align 8 +// CHECK-NEXT: store [2 x %struct.S]* [[S_ARR]], [2 x %struct.S]** [[S_ARR_ADDR]], align 8 +// CHECK-NEXT: store %struct.S* [[VAR]], %struct.S** [[VAR_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load [2 x i32]*, [2 x i32]** [[VEC_ADDR]], align 8 +// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[T_VAR_ADDR]] to i32* +// CHECK-NEXT: [[TMP1:%.*]] = load [2 x %struct.S]*, [2 x %struct.S]** [[S_ARR_ADDR]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load %struct.S*, %struct.S** [[VAR_ADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = icmp ne i32 [[TMP5]], 0 +// CHECK-NEXT: br i1 [[TMP6]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_END:%.*]] +// CHECK: omp_if.then: +// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[AGG_CAPTURED]], i32 0, i32 0 +// CHECK-NEXT: store [2 x %struct.S]* [[TMP1]], [2 x %struct.S]** [[TMP7]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[AGG_CAPTURED]], i32 0, i32 1 +// CHECK-NEXT: store %struct.S* [[TMP2]], %struct.S** [[TMP8]], align 8 +// CHECK-NEXT: call void @__kmpc_taskgroup(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK-NEXT: [[TMP9:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]], i32 9, i64 120, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @.omp_task_entry. 
to i32 (i32, i8*)*)) +// CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to %struct.kmp_task_t_with_privates* +// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], %struct.kmp_task_t_with_privates* [[TMP10]], i32 0, i32 0 +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP11]], i32 0, i32 0 +// CHECK-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast %struct.anon* [[AGG_CAPTURED]] to i8* +// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP13]], i8* align 8 [[TMP14]], i64 16, i1 false) +// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], %struct.kmp_task_t_with_privates* [[TMP10]], i32 0, i32 1 +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP15]], i32 0, i32 0 +// CHECK-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[TMP16]], i32 0, i32 0 +// CHECK-NEXT: [[TMP17:%.*]] = bitcast [2 x %struct.S]* [[TMP1]] to %struct.S* +// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[ARRAY_BEGIN]], i64 2 +// CHECK-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq %struct.S* [[ARRAY_BEGIN]], [[TMP18]] +// CHECK-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE1:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]] +// CHECK: omp.arraycpy.body: +// CHECK-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi %struct.S* [ [[TMP17]], [[OMP_IF_THEN]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] +// CHECK-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi %struct.S* [ [[ARRAY_BEGIN]], [[OMP_IF_THEN]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] +// CHECK-NEXT: call void @_ZN1SIdEC1ERKS0_d(%struct.S* noundef nonnull align 8 dereferenceable(8) [[OMP_ARRAYCPY_DESTELEMENTPAST]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[OMP_ARRAYCPY_SRCELEMENTPAST]], double noundef 0.000000e+00) +// CHECK-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr [[STRUCT_S]], %struct.S* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1 +// CHECK-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr [[STRUCT_S]], %struct.S* [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1 +// CHECK-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq %struct.S* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP18]] +// CHECK-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE1]], label [[OMP_ARRAYCPY_BODY]] +// CHECK: omp.arraycpy.done1: +// CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP15]], i32 0, i32 1 +// CHECK-NEXT: call void @_ZN1SIdEC1ERKS0_d(%struct.S* noundef nonnull align 8 dereferenceable(8) [[TMP19]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[TMP2]], double noundef 0.000000e+00) +// CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP15]], i32 0, i32 2 +// CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[CONV]], align 4 +// CHECK-NEXT: store i32 [[TMP21]], i32* [[TMP20]], align 8 +// CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP15]], i32 0, i32 3 +// CHECK-NEXT: [[TMP23:%.*]] = bitcast [2 x i32]* [[TMP22]] to i8* +// CHECK-NEXT: [[TMP24:%.*]] = bitcast [2 x i32]* [[TMP0]] to i8* +// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP23]], i8* align 4 [[TMP24]], i64 8, i1 false) +// CHECK-NEXT: 
[[TMP25:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP15]], i32 0, i32 4 +// CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* @_ZZ4mainE5sivar, align 4 +// CHECK-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4 +// CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP11]], i32 0, i32 3 +// CHECK-NEXT: [[TMP28:%.*]] = bitcast %union.kmp_cmplrdata_t* [[TMP27]] to i32 (i32, i8*)** +// CHECK-NEXT: store i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @.omp_task_destructor. to i32 (i32, i8*)*), i32 (i32, i8*)** [[TMP28]], align 8 +// CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP11]], i32 0, i32 5 +// CHECK-NEXT: store i64 0, i64* [[TMP29]], align 8 +// CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP11]], i32 0, i32 6 +// CHECK-NEXT: store i64 9, i64* [[TMP30]], align 8 +// CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP11]], i32 0, i32 7 +// CHECK-NEXT: store i64 1, i64* [[TMP31]], align 8 +// CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP11]], i32 0, i32 9 +// CHECK-NEXT: [[TMP33:%.*]] = bitcast i8** [[TMP32]] to i8* +// CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP33]], i8 0, i64 8, i1 false) +// CHECK-NEXT: [[TMP34:%.*]] = load i64, i64* [[TMP31]], align 8 +// CHECK-NEXT: call void @__kmpc_taskloop(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]], i8* [[TMP9]], i32 1, i64* [[TMP29]], i64* [[TMP30]], i64 [[TMP34]], i32 1, i32 0, i64 0, i8* bitcast (void (%struct.kmp_task_t_with_privates*, %struct.kmp_task_t_with_privates*, i32)* @.omp_task_dup. to i8*)) +// CHECK-NEXT: call void @__kmpc_end_taskgroup(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK-NEXT: br label [[OMP_IF_END]] +// CHECK: omp_if.end: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@.omp_task_privates_map. 
+// CHECK-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], %struct.S** noalias noundef [[TMP1:%.*]], i32** noalias noundef [[TMP2:%.*]], [2 x %struct.S]** noalias noundef [[TMP3:%.*]], [2 x i32]** noalias noundef [[TMP4:%.*]], i32** noalias noundef [[TMP5:%.*]]) #[[ATTR6:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca %struct..kmp_privates.t*, align 8 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca %struct.S**, align 8 +// CHECK-NEXT: [[DOTADDR2:%.*]] = alloca i32**, align 8 +// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca [2 x %struct.S]**, align 8 +// CHECK-NEXT: [[DOTADDR4:%.*]] = alloca [2 x i32]**, align 8 +// CHECK-NEXT: [[DOTADDR5:%.*]] = alloca i32**, align 8 +// CHECK-NEXT: store %struct..kmp_privates.t* [[TMP0]], %struct..kmp_privates.t** [[DOTADDR]], align 8 +// CHECK-NEXT: store %struct.S** [[TMP1]], %struct.S*** [[DOTADDR1]], align 8 +// CHECK-NEXT: store i32** [[TMP2]], i32*** [[DOTADDR2]], align 8 +// CHECK-NEXT: store [2 x %struct.S]** [[TMP3]], [2 x %struct.S]*** [[DOTADDR3]], align 8 +// CHECK-NEXT: store [2 x i32]** [[TMP4]], [2 x i32]*** [[DOTADDR4]], align 8 +// CHECK-NEXT: store i32** [[TMP5]], i32*** [[DOTADDR5]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load %struct..kmp_privates.t*, %struct..kmp_privates.t** [[DOTADDR]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP6]], i32 0, i32 0 +// CHECK-NEXT: [[TMP8:%.*]] = load [2 x %struct.S]**, [2 x %struct.S]*** [[DOTADDR3]], align 8 +// CHECK-NEXT: store [2 x %struct.S]* [[TMP7]], [2 x %struct.S]** [[TMP8]], align 8 +// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP6]], i32 0, i32 1 +// CHECK-NEXT: [[TMP10:%.*]] = load %struct.S**, %struct.S*** [[DOTADDR1]], align 8 +// CHECK-NEXT: store %struct.S* [[TMP9]], %struct.S** [[TMP10]], align 8 +// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP6]], i32 0, i32 2 +// CHECK-NEXT: [[TMP12:%.*]] = load i32**, i32*** [[DOTADDR2]], align 8 +// CHECK-NEXT: store i32* [[TMP11]], i32** [[TMP12]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP6]], i32 0, i32 3 +// CHECK-NEXT: [[TMP14:%.*]] = load [2 x i32]**, [2 x i32]*** [[DOTADDR4]], align 8 +// CHECK-NEXT: store [2 x i32]* [[TMP13]], [2 x i32]** [[TMP14]], align 8 +// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP6]], i32 0, i32 4 +// CHECK-NEXT: [[TMP16:%.*]] = load i32**, i32*** [[DOTADDR5]], align 8 +// CHECK-NEXT: store i32* [[TMP15]], i32** [[TMP16]], align 8 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@.omp_task_entry. 
+// CHECK-SAME: (i32 noundef [[TMP0:%.*]], %struct.kmp_task_t_with_privates* noalias noundef [[TMP1:%.*]]) #[[ATTR7:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca i8*, align 8 +// CHECK-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca void (i8*, ...)*, align 8 +// CHECK-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca i8*, align 8 +// CHECK-NEXT: [[DOTLB__ADDR_I:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[DOTUB__ADDR_I:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[DOTST__ADDR_I:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[DOTLITER__ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTREDUCTIONS__ADDR_I:%.*]] = alloca i8*, align 8 +// CHECK-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca %struct.anon*, align 8 +// CHECK-NEXT: [[DOTFIRSTPRIV_PTR_ADDR_I:%.*]] = alloca %struct.S*, align 8 +// CHECK-NEXT: [[DOTFIRSTPRIV_PTR_ADDR1_I:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[DOTFIRSTPRIV_PTR_ADDR2_I:%.*]] = alloca [2 x %struct.S]*, align 8 +// CHECK-NEXT: [[DOTFIRSTPRIV_PTR_ADDR3_I:%.*]] = alloca [2 x i32]*, align 8 +// CHECK-NEXT: [[DOTFIRSTPRIV_PTR_ADDR4_I:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[I_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_IV_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca %struct.kmp_task_t_with_privates*, align 8 +// CHECK-NEXT: store i32 [[TMP0]], i32* [[DOTADDR]], align 4 +// CHECK-NEXT: store %struct.kmp_task_t_with_privates* [[TMP1]], %struct.kmp_task_t_with_privates** [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTADDR]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load %struct.kmp_task_t_with_privates*, %struct.kmp_task_t_with_privates** [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], %struct.kmp_task_t_with_privates* [[TMP3]], i32 0, i32 0 +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 2 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 0 +// CHECK-NEXT: [[TMP7:%.*]] = load i8*, i8** [[TMP6]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct.anon* +// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], %struct.kmp_task_t_with_privates* [[TMP3]], i32 0, i32 1 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast %struct..kmp_privates.t* [[TMP9]] to i8* +// CHECK-NEXT: [[TMP11:%.*]] = bitcast %struct.kmp_task_t_with_privates* [[TMP3]] to i8* +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 5 +// CHECK-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 6 +// CHECK-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8 +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 7 +// CHECK-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8 +// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 8 +// CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds 
[[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 9 +// CHECK-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 8 +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]]) +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]]) +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META8:![0-9]+]]) +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]]) +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]]) +// CHECK-NEXT: store i32 [[TMP2]], i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14 +// CHECK-NEXT: store i32* [[TMP5]], i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !14 +// CHECK-NEXT: store i8* [[TMP10]], i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !14 +// CHECK-NEXT: store void (i8*, ...)* bitcast (void (%struct..kmp_privates.t*, %struct.S**, i32**, [2 x %struct.S]**, [2 x i32]**, i32**)* @.omp_task_privates_map. to void (i8*, ...)*), void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !14 +// CHECK-NEXT: store i8* [[TMP11]], i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !14 +// CHECK-NEXT: store i64 [[TMP13]], i64* [[DOTLB__ADDR_I]], align 8, !noalias !14 +// CHECK-NEXT: store i64 [[TMP15]], i64* [[DOTUB__ADDR_I]], align 8, !noalias !14 +// CHECK-NEXT: store i64 [[TMP17]], i64* [[DOTST__ADDR_I]], align 8, !noalias !14 +// CHECK-NEXT: store i32 [[TMP19]], i32* [[DOTLITER__ADDR_I]], align 4, !noalias !14 +// CHECK-NEXT: store i8* [[TMP21]], i8** [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias !14 +// CHECK-NEXT: store %struct.anon* [[TMP8]], %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !14 +// CHECK-NEXT: [[TMP22:%.*]] = load %struct.anon*, %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !14 +// CHECK-NEXT: [[TMP23:%.*]] = load void (i8*, ...)*, void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !14 +// CHECK-NEXT: [[TMP24:%.*]] = load i8*, i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !14 +// CHECK-NEXT: [[TMP25:%.*]] = bitcast void (i8*, ...)* [[TMP23]] to void (i8*, %struct.S**, i32**, [2 x %struct.S]**, [2 x i32]**, i32**)* +// CHECK-NEXT: call void [[TMP25]](i8* [[TMP24]], %struct.S** [[DOTFIRSTPRIV_PTR_ADDR_I]], i32** [[DOTFIRSTPRIV_PTR_ADDR1_I]], [2 x %struct.S]** [[DOTFIRSTPRIV_PTR_ADDR2_I]], [2 x i32]** [[DOTFIRSTPRIV_PTR_ADDR3_I]], i32** [[DOTFIRSTPRIV_PTR_ADDR4_I]]) #[[ATTR4]] +// CHECK-NEXT: [[TMP26:%.*]] = load %struct.S*, %struct.S** [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !14 +// CHECK-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias !14 +// CHECK-NEXT: [[TMP28:%.*]] = load [2 x %struct.S]*, [2 x %struct.S]** [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 8, !noalias !14 +// CHECK-NEXT: [[TMP29:%.*]] = load [2 x i32]*, [2 x i32]** [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 8, !noalias !14 +// CHECK-NEXT: [[TMP30:%.*]] = load i32*, i32** [[DOTFIRSTPRIV_PTR_ADDR4_I]], align 8, !noalias !14 +// CHECK-NEXT: [[TMP31:%.*]] = load i64, i64* [[DOTLB__ADDR_I]], align 8, !noalias !14 +// CHECK-NEXT: [[CONV_I:%.*]] = trunc i64 [[TMP31]] to i32 +// CHECK-NEXT: store i32 [[CONV_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !14 +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND_I:%.*]] +// CHECK: omp.inner.for.cond.i: +// CHECK-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14 +// CHECK-NEXT: [[CONV5_I:%.*]] = sext i32 [[TMP32]] to i64 +// CHECK-NEXT: [[TMP33:%.*]] = load i64, i64* [[DOTUB__ADDR_I]], align 8, 
!noalias !14 +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule i64 [[CONV5_I]], [[TMP33]] +// CHECK-NEXT: br i1 [[CMP_I]], label [[OMP_INNER_FOR_BODY_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]] +// CHECK: omp.inner.for.body.i: +// CHECK-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14 +// CHECK-NEXT: store i32 [[TMP34]], i32* [[I_I]], align 4, !noalias !14 +// CHECK-NEXT: [[TMP35:%.*]] = load i32, i32* [[TMP27]], align 4 +// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[TMP29]], i64 0, i64 0 +// CHECK-NEXT: store i32 [[TMP35]], i32* [[ARRAYIDX_I]], align 4 +// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[TMP28]], i64 0, i64 0 +// CHECK-NEXT: [[TMP36:%.*]] = bitcast %struct.S* [[ARRAYIDX6_I]] to i8* +// CHECK-NEXT: [[TMP37:%.*]] = bitcast %struct.S* [[TMP26]] to i8* +// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP36]], i8* align 8 [[TMP37]], i64 8, i1 false) +// CHECK-NEXT: store i32 33, i32* [[TMP30]], align 4 +// CHECK-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14 +// CHECK-NEXT: [[ADD7_I:%.*]] = add nsw i32 [[TMP38]], 1 +// CHECK-NEXT: store i32 [[ADD7_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !14 +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND_I]] +// CHECK: .omp_outlined..1.exit: +// CHECK-NEXT: ret i32 0 +// +// +// CHECK-LABEL: define {{[^@]+}}@.omp_task_dup. +// CHECK-SAME: (%struct.kmp_task_t_with_privates* noundef [[TMP0:%.*]], %struct.kmp_task_t_with_privates* noundef [[TMP1:%.*]], i32 noundef [[TMP2:%.*]]) #[[ATTR7]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca %struct.kmp_task_t_with_privates*, align 8 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca %struct.kmp_task_t_with_privates*, align 8 +// CHECK-NEXT: [[DOTADDR2:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store %struct.kmp_task_t_with_privates* [[TMP0]], %struct.kmp_task_t_with_privates** [[DOTADDR]], align 8 +// CHECK-NEXT: store %struct.kmp_task_t_with_privates* [[TMP1]], %struct.kmp_task_t_with_privates** [[DOTADDR1]], align 8 +// CHECK-NEXT: store i32 [[TMP2]], i32* [[DOTADDR2]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load %struct.kmp_task_t_with_privates*, %struct.kmp_task_t_with_privates** [[DOTADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load %struct.kmp_task_t_with_privates*, %struct.kmp_task_t_with_privates** [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], %struct.kmp_task_t_with_privates* [[TMP4]], i32 0, i32 0 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP5]], i32 0, i32 0 +// CHECK-NEXT: [[TMP7:%.*]] = load i8*, i8** [[TMP6]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], %struct.kmp_task_t_with_privates* [[TMP3]], i32 0, i32 1 +// CHECK-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP7]] to %struct.anon* +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP8]], i32 0, i32 0 +// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], %struct.anon* [[TMP9]], i32 0, i32 0 +// CHECK-NEXT: [[TMP12:%.*]] = load [2 x %struct.S]*, [2 x %struct.S]** [[TMP11]], align 8 +// CHECK-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[TMP10]], i32 0, i32 0 +// CHECK-NEXT: [[TMP13:%.*]] = bitcast [2 x %struct.S]* [[TMP12]] to %struct.S* +// CHECK-NEXT: [[TMP14:%.*]] = 
getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[ARRAY_BEGIN]], i64 2 +// CHECK-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq %struct.S* [[ARRAY_BEGIN]], [[TMP14]] +// CHECK-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE3:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]] +// CHECK: omp.arraycpy.body: +// CHECK-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi %struct.S* [ [[TMP13]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] +// CHECK-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi %struct.S* [ [[ARRAY_BEGIN]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] +// CHECK-NEXT: call void @_ZN1SIdEC1ERKS0_d(%struct.S* noundef nonnull align 8 dereferenceable(8) [[OMP_ARRAYCPY_DESTELEMENTPAST]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[OMP_ARRAYCPY_SRCELEMENTPAST]], double noundef 0.000000e+00) +// CHECK-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr [[STRUCT_S]], %struct.S* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1 +// CHECK-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr [[STRUCT_S]], %struct.S* [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1 +// CHECK-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq %struct.S* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP14]] +// CHECK-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE3]], label [[OMP_ARRAYCPY_BODY]] +// CHECK: omp.arraycpy.done3: +// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP8]], i32 0, i32 1 +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP9]], i32 0, i32 1 +// CHECK-NEXT: [[TMP17:%.*]] = load %struct.S*, %struct.S** [[TMP16]], align 8 +// CHECK-NEXT: call void @_ZN1SIdEC1ERKS0_d(%struct.S* noundef nonnull align 8 dereferenceable(8) [[TMP15]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[TMP17]], double noundef 0.000000e+00) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@.omp_task_destructor. 
+// CHECK-SAME: (i32 noundef [[TMP0:%.*]], %struct.kmp_task_t_with_privates* noalias noundef [[TMP1:%.*]]) #[[ATTR7]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca %struct.kmp_task_t_with_privates*, align 8 +// CHECK-NEXT: store i32 [[TMP0]], i32* [[DOTADDR]], align 4 +// CHECK-NEXT: store %struct.kmp_task_t_with_privates* [[TMP1]], %struct.kmp_task_t_with_privates** [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load %struct.kmp_task_t_with_privates*, %struct.kmp_task_t_with_privates** [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], %struct.kmp_task_t_with_privates* [[TMP2]], i32 0, i32 1 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP3]], i32 0, i32 0 +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP3]], i32 0, i32 1 +// CHECK-NEXT: call void @_ZN1SIdED1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[TMP5]]) #[[ATTR4]] +// CHECK-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[TMP4]], i32 0, i32 0 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[ARRAY_BEGIN]], i64 2 +// CHECK-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] +// CHECK: arraydestroy.body: +// CHECK-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi %struct.S* [ [[TMP6]], [[ENTRY:%.*]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] +// CHECK-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 +// CHECK-NEXT: call void @_ZN1SIdED1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq %struct.S* [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] +// CHECK-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE2:%.*]], label [[ARRAYDESTROY_BODY]] +// CHECK: arraydestroy.done2: +// CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[RETVAL]], align 4 +// CHECK-NEXT: ret i32 [[TMP7]] +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIdED1Ev +// CHECK-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// CHECK-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: call void @_ZN1SIdED2Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS1]]) #[[ATTR4]] +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_Z5tmainIiET_v +// CHECK-SAME: () #[[ATTR9:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[TTT:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 +// CHECK-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0]], align 4 +// CHECK-NEXT: [[T_VAR:%.*]] = alloca i32, align 128 +// CHECK-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4 +// CHECK-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S.0], align 4 +// CHECK-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S_0]], align 4 +// CHECK-NEXT: [[T_VAR_CASTED:%.*]] = alloca i64, align 8 +// CHECK-NEXT: call void @_ZN1SIiEC1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TTT]]) +// CHECK-NEXT: call void 
@_ZN1SIiEC1ERKS0_i(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TEST]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TTT]], i32 noundef 0) +// CHECK-NEXT: store i32 0, i32* [[T_VAR]], align 128 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [2 x i32]* [[VEC]] to i8* +// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP0]], i8* align 4 bitcast ([2 x i32]* @__const._Z5tmainIiET_v.vec to i8*), i64 8, i1 false) +// CHECK-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[S_ARR]], i64 0, i64 0 +// CHECK-NEXT: call void @_ZN1SIiEC1Ei(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1) +// CHECK-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], %struct.S.0* [[ARRAYINIT_BEGIN]], i64 1 +// CHECK-NEXT: call void @_ZN1SIiEC1Ei(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2) +// CHECK-NEXT: call void @_ZN1SIiEC1Ei(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[VAR]], i32 noundef 3) +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[T_VAR]], align 128 +// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[T_VAR_CASTED]] to i32* +// CHECK-NEXT: store i32 [[TMP1]], i32* [[CONV]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i64, i64* [[T_VAR_CASTED]], align 8 +// CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, [2 x i32]*, i64, [2 x %struct.S.0]*, %struct.S.0*)* @.omp_outlined..2 to void (i32*, i32*, ...)*), [2 x i32]* [[VEC]], i64 [[TMP2]], [2 x %struct.S.0]* [[S_ARR]], %struct.S.0* [[VAR]]) +// CHECK-NEXT: store i32 0, i32* [[RETVAL]], align 4 +// CHECK-NEXT: call void @_ZN1SIiED1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR4]] +// CHECK-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[S_ARR]], i32 0, i32 0 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_S_0]], %struct.S.0* [[ARRAY_BEGIN]], i64 2 +// CHECK-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] +// CHECK: arraydestroy.body: +// CHECK-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi %struct.S.0* [ [[TMP3]], [[ENTRY:%.*]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] +// CHECK-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], %struct.S.0* [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 +// CHECK-NEXT: call void @_ZN1SIiED1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq %struct.S.0* [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] +// CHECK-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE1:%.*]], label [[ARRAYDESTROY_BODY]] +// CHECK: arraydestroy.done1: +// CHECK-NEXT: call void @_ZN1SIiED1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK-NEXT: call void @_ZN1SIiED1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TTT]]) #[[ATTR4]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[RETVAL]], align 4 +// CHECK-NEXT: ret i32 [[TMP4]] +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIdEC2Ev +// CHECK-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// CHECK-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: 
[[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[THIS1]], i32 0, i32 0 +// CHECK-NEXT: store double 0.000000e+00, double* [[F]], align 8 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIdEC2ERKS0_d +// CHECK-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[S:%.*]], double noundef [[T:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// CHECK-NEXT: [[S_ADDR:%.*]] = alloca %struct.S*, align 8 +// CHECK-NEXT: [[T_ADDR:%.*]] = alloca double, align 8 +// CHECK-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: store %struct.S* [[S]], %struct.S** [[S_ADDR]], align 8 +// CHECK-NEXT: store double [[T]], double* [[T_ADDR]], align 8 +// CHECK-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[THIS1]], i32 0, i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = load %struct.S*, %struct.S** [[S_ADDR]], align 8 +// CHECK-NEXT: [[F2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[TMP0]], i32 0, i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[F2]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load double, double* [[T_ADDR]], align 8 +// CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], [[TMP2]] +// CHECK-NEXT: store double [[ADD]], double* [[F]], align 8 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIdEC2Ed +// CHECK-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], double noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca double, align 8 +// CHECK-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: store double [[A]], double* [[A_ADDR]], align 8 +// CHECK-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[THIS1]], i32 0, i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[A_ADDR]], align 8 +// CHECK-NEXT: store double [[TMP0]], double* [[F]], align 8 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIdED2Ev +// CHECK-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// CHECK-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIiEC1Ev +// CHECK-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// CHECK-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: call void @_ZN1SIiEC2Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS1]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIiEC1ERKS0_i +// CHECK-SAME: (%struct.S.0* noundef nonnull 
align 4 dereferenceable(4) [[THIS:%.*]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[S:%.*]], i32 noundef [[T:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// CHECK-NEXT: [[S_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// CHECK-NEXT: [[T_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: store %struct.S.0* [[S]], %struct.S.0** [[S_ADDR]], align 8 +// CHECK-NEXT: store i32 [[T]], i32* [[T_ADDR]], align 4 +// CHECK-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load %struct.S.0*, %struct.S.0** [[S_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[T_ADDR]], align 4 +// CHECK-NEXT: call void @_ZN1SIiEC2ERKS0_i(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS1]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TMP0]], i32 noundef [[TMP1]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIiEC1Ei +// CHECK-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK-NEXT: call void @_ZN1SIiEC2Ei(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS1]], i32 noundef [[TMP0]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@.omp_outlined..2 +// CHECK-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], [2 x i32]* noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 noundef [[T_VAR:%.*]], [2 x %struct.S.0]* noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR3]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[VEC_ADDR:%.*]] = alloca [2 x i32]*, align 8 +// CHECK-NEXT: [[T_VAR_ADDR:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[S_ARR_ADDR:%.*]] = alloca [2 x %struct.S.0]*, align 8 +// CHECK-NEXT: [[VAR_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// CHECK-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON_1:%.*]], align 8 +// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK-NEXT: store [2 x i32]* [[VEC]], [2 x i32]** [[VEC_ADDR]], align 8 +// CHECK-NEXT: store i64 [[T_VAR]], i64* [[T_VAR_ADDR]], align 8 +// CHECK-NEXT: store [2 x %struct.S.0]* [[S_ARR]], [2 x %struct.S.0]** [[S_ARR_ADDR]], align 8 +// CHECK-NEXT: store %struct.S.0* [[VAR]], %struct.S.0** [[VAR_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load [2 x i32]*, [2 x i32]** [[VEC_ADDR]], align 8 +// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[T_VAR_ADDR]] to i32* +// CHECK-NEXT: [[TMP1:%.*]] = load [2 x %struct.S.0]*, [2 x %struct.S.0]** [[S_ARR_ADDR]], align 8 +// CHECK-NEXT: 
[[TMP2:%.*]] = load %struct.S.0*, %struct.S.0** [[VAR_ADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = icmp ne i32 [[TMP5]], 0 +// CHECK-NEXT: br i1 [[TMP6]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_END:%.*]] +// CHECK: omp_if.then: +// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_ANON_1]], %struct.anon.1* [[AGG_CAPTURED]], i32 0, i32 0 +// CHECK-NEXT: store [2 x %struct.S.0]* [[TMP1]], [2 x %struct.S.0]** [[TMP7]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_ANON_1]], %struct.anon.1* [[AGG_CAPTURED]], i32 0, i32 1 +// CHECK-NEXT: store %struct.S.0* [[TMP2]], %struct.S.0** [[TMP8]], align 8 +// CHECK-NEXT: call void @__kmpc_taskgroup(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK-NEXT: [[TMP9:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]], i32 9, i64 256, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.2*)* @.omp_task_entry..5 to i32 (i32, i8*)*)) +// CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to %struct.kmp_task_t_with_privates.2* +// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_2:%.*]], %struct.kmp_task_t_with_privates.2* [[TMP10]], i32 0, i32 0 +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP11]], i32 0, i32 0 +// CHECK-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 128 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast %struct.anon.1* [[AGG_CAPTURED]] to i8* +// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP13]], i8* align 8 [[TMP14]], i64 16, i1 false) +// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_2]], %struct.kmp_task_t_with_privates.2* [[TMP10]], i32 0, i32 2 +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3:%.*]], %struct..kmp_privates.t.3* [[TMP15]], i32 0, i32 0 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[CONV]], align 4 +// CHECK-NEXT: store i32 [[TMP17]], i32* [[TMP16]], align 128 +// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3]], %struct..kmp_privates.t.3* [[TMP15]], i32 0, i32 1 +// CHECK-NEXT: [[TMP19:%.*]] = bitcast [2 x i32]* [[TMP18]] to i8* +// CHECK-NEXT: [[TMP20:%.*]] = bitcast [2 x i32]* [[TMP0]] to i8* +// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP19]], i8* align 4 [[TMP20]], i64 8, i1 false) +// CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3]], %struct..kmp_privates.t.3* [[TMP15]], i32 0, i32 2 +// CHECK-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[TMP21]], i32 0, i32 0 +// CHECK-NEXT: [[TMP22:%.*]] = bitcast [2 x %struct.S.0]* [[TMP1]] to %struct.S.0* +// CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], %struct.S.0* [[ARRAY_BEGIN]], i64 2 +// CHECK-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq %struct.S.0* [[ARRAY_BEGIN]], [[TMP23]] +// CHECK-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE1:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]] +// CHECK: omp.arraycpy.body: +// CHECK-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi %struct.S.0* [ [[TMP22]], [[OMP_IF_THEN]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] +// CHECK-NEXT: 
[[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi %struct.S.0* [ [[ARRAY_BEGIN]], [[OMP_IF_THEN]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] +// CHECK-NEXT: call void @_ZN1SIiEC1ERKS0_i(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[OMP_ARRAYCPY_DESTELEMENTPAST]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 noundef 0) +// CHECK-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr [[STRUCT_S_0]], %struct.S.0* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1 +// CHECK-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr [[STRUCT_S_0]], %struct.S.0* [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1 +// CHECK-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq %struct.S.0* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP23]] +// CHECK-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE1]], label [[OMP_ARRAYCPY_BODY]] +// CHECK: omp.arraycpy.done1: +// CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3]], %struct..kmp_privates.t.3* [[TMP15]], i32 0, i32 3 +// CHECK-NEXT: call void @_ZN1SIiEC1ERKS0_i(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TMP24]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TMP2]], i32 noundef 0) +// CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP11]], i32 0, i32 3 +// CHECK-NEXT: [[TMP26:%.*]] = bitcast %union.kmp_cmplrdata_t* [[TMP25]] to i32 (i32, i8*)** +// CHECK-NEXT: store i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.2*)* @.omp_task_destructor..7 to i32 (i32, i8*)*), i32 (i32, i8*)** [[TMP26]], align 8 +// CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP11]], i32 0, i32 5 +// CHECK-NEXT: store i64 0, i64* [[TMP27]], align 8 +// CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP11]], i32 0, i32 6 +// CHECK-NEXT: store i64 9, i64* [[TMP28]], align 16 +// CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP11]], i32 0, i32 7 +// CHECK-NEXT: store i64 1, i64* [[TMP29]], align 8 +// CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP11]], i32 0, i32 9 +// CHECK-NEXT: [[TMP31:%.*]] = bitcast i8** [[TMP30]] to i8* +// CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP31]], i8 0, i64 8, i1 false) +// CHECK-NEXT: [[TMP32:%.*]] = load i64, i64* [[TMP29]], align 8 +// CHECK-NEXT: call void @__kmpc_taskloop(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]], i8* [[TMP9]], i32 1, i64* [[TMP27]], i64* [[TMP28]], i64 [[TMP32]], i32 1, i32 0, i64 0, i8* bitcast (void (%struct.kmp_task_t_with_privates.2*, %struct.kmp_task_t_with_privates.2*, i32)* @.omp_task_dup..6 to i8*)) +// CHECK-NEXT: call void @__kmpc_end_taskgroup(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK-NEXT: br label [[OMP_IF_END]] +// CHECK: omp_if.end: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@.omp_task_privates_map..4 +// CHECK-SAME: (%struct..kmp_privates.t.3* noalias noundef [[TMP0:%.*]], i32** noalias noundef [[TMP1:%.*]], [2 x i32]** noalias noundef [[TMP2:%.*]], [2 x %struct.S.0]** noalias noundef [[TMP3:%.*]], %struct.S.0** noalias noundef [[TMP4:%.*]]) #[[ATTR6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca %struct..kmp_privates.t.3*, align 8 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32**, align 8 +// CHECK-NEXT: 
[[DOTADDR2:%.*]] = alloca [2 x i32]**, align 8 +// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca [2 x %struct.S.0]**, align 8 +// CHECK-NEXT: [[DOTADDR4:%.*]] = alloca %struct.S.0**, align 8 +// CHECK-NEXT: store %struct..kmp_privates.t.3* [[TMP0]], %struct..kmp_privates.t.3** [[DOTADDR]], align 8 +// CHECK-NEXT: store i32** [[TMP1]], i32*** [[DOTADDR1]], align 8 +// CHECK-NEXT: store [2 x i32]** [[TMP2]], [2 x i32]*** [[DOTADDR2]], align 8 +// CHECK-NEXT: store [2 x %struct.S.0]** [[TMP3]], [2 x %struct.S.0]*** [[DOTADDR3]], align 8 +// CHECK-NEXT: store %struct.S.0** [[TMP4]], %struct.S.0*** [[DOTADDR4]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load %struct..kmp_privates.t.3*, %struct..kmp_privates.t.3** [[DOTADDR]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3:%.*]], %struct..kmp_privates.t.3* [[TMP5]], i32 0, i32 0 +// CHECK-NEXT: [[TMP7:%.*]] = load i32**, i32*** [[DOTADDR1]], align 8 +// CHECK-NEXT: store i32* [[TMP6]], i32** [[TMP7]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3]], %struct..kmp_privates.t.3* [[TMP5]], i32 0, i32 1 +// CHECK-NEXT: [[TMP9:%.*]] = load [2 x i32]**, [2 x i32]*** [[DOTADDR2]], align 8 +// CHECK-NEXT: store [2 x i32]* [[TMP8]], [2 x i32]** [[TMP9]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3]], %struct..kmp_privates.t.3* [[TMP5]], i32 0, i32 2 +// CHECK-NEXT: [[TMP11:%.*]] = load [2 x %struct.S.0]**, [2 x %struct.S.0]*** [[DOTADDR3]], align 8 +// CHECK-NEXT: store [2 x %struct.S.0]* [[TMP10]], [2 x %struct.S.0]** [[TMP11]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3]], %struct..kmp_privates.t.3* [[TMP5]], i32 0, i32 3 +// CHECK-NEXT: [[TMP13:%.*]] = load %struct.S.0**, %struct.S.0*** [[DOTADDR4]], align 8 +// CHECK-NEXT: store %struct.S.0* [[TMP12]], %struct.S.0** [[TMP13]], align 8 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@.omp_task_entry..5 +// CHECK-SAME: (i32 noundef [[TMP0:%.*]], %struct.kmp_task_t_with_privates.2* noalias noundef [[TMP1:%.*]]) #[[ATTR7]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca i8*, align 8 +// CHECK-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca void (i8*, ...)*, align 8 +// CHECK-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca i8*, align 8 +// CHECK-NEXT: [[DOTLB__ADDR_I:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[DOTUB__ADDR_I:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[DOTST__ADDR_I:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[DOTLITER__ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTREDUCTIONS__ADDR_I:%.*]] = alloca i8*, align 8 +// CHECK-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca %struct.anon.1*, align 8 +// CHECK-NEXT: [[DOTFIRSTPRIV_PTR_ADDR_I:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[DOTFIRSTPRIV_PTR_ADDR1_I:%.*]] = alloca [2 x i32]*, align 8 +// CHECK-NEXT: [[DOTFIRSTPRIV_PTR_ADDR2_I:%.*]] = alloca [2 x %struct.S.0]*, align 8 +// CHECK-NEXT: [[DOTFIRSTPRIV_PTR_ADDR3_I:%.*]] = alloca %struct.S.0*, align 8 +// CHECK-NEXT: [[I_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_IV_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca %struct.kmp_task_t_with_privates.2*, align 8 +// CHECK-NEXT: store i32 [[TMP0]], i32* [[DOTADDR]], align 4 +// CHECK-NEXT: store 
%struct.kmp_task_t_with_privates.2* [[TMP1]], %struct.kmp_task_t_with_privates.2** [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTADDR]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load %struct.kmp_task_t_with_privates.2*, %struct.kmp_task_t_with_privates.2** [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_2:%.*]], %struct.kmp_task_t_with_privates.2* [[TMP3]], i32 0, i32 0 +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 2 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 0 +// CHECK-NEXT: [[TMP7:%.*]] = load i8*, i8** [[TMP6]], align 128 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct.anon.1* +// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_2]], %struct.kmp_task_t_with_privates.2* [[TMP3]], i32 0, i32 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast %struct..kmp_privates.t.3* [[TMP9]] to i8* +// CHECK-NEXT: [[TMP11:%.*]] = bitcast %struct.kmp_task_t_with_privates.2* [[TMP3]] to i8* +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 5 +// CHECK-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 6 +// CHECK-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 16 +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 7 +// CHECK-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8 +// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 8 +// CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 64 +// CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 9 +// CHECK-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 8 +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META17:![0-9]+]]) +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META20:![0-9]+]]) +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META22:![0-9]+]]) +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META24:![0-9]+]]) +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META26:![0-9]+]]) +// CHECK-NEXT: store i32 [[TMP2]], i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !28 +// CHECK-NEXT: store i32* [[TMP5]], i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !28 +// CHECK-NEXT: store i8* [[TMP10]], i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !28 +// CHECK-NEXT: store void (i8*, ...)* bitcast (void (%struct..kmp_privates.t.3*, i32**, [2 x i32]**, [2 x %struct.S.0]**, %struct.S.0**)* @.omp_task_privates_map..4 to void (i8*, ...)*), void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !28 +// CHECK-NEXT: store i8* [[TMP11]], i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !28 +// CHECK-NEXT: store i64 [[TMP13]], i64* [[DOTLB__ADDR_I]], align 8, !noalias !28 +// CHECK-NEXT: store i64 [[TMP15]], i64* [[DOTUB__ADDR_I]], align 8, !noalias !28 +// CHECK-NEXT: store i64 [[TMP17]], i64* [[DOTST__ADDR_I]], align 8, !noalias !28 +// CHECK-NEXT: store i32 [[TMP19]], i32* [[DOTLITER__ADDR_I]], align 4, !noalias !28 +// CHECK-NEXT: store i8* [[TMP21]], i8** 
[[DOTREDUCTIONS__ADDR_I]], align 8, !noalias !28 +// CHECK-NEXT: store %struct.anon.1* [[TMP8]], %struct.anon.1** [[__CONTEXT_ADDR_I]], align 8, !noalias !28 +// CHECK-NEXT: [[TMP22:%.*]] = load %struct.anon.1*, %struct.anon.1** [[__CONTEXT_ADDR_I]], align 8, !noalias !28 +// CHECK-NEXT: [[TMP23:%.*]] = load void (i8*, ...)*, void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !28 +// CHECK-NEXT: [[TMP24:%.*]] = load i8*, i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !28 +// CHECK-NEXT: [[TMP25:%.*]] = bitcast void (i8*, ...)* [[TMP23]] to void (i8*, i32**, [2 x i32]**, [2 x %struct.S.0]**, %struct.S.0**)* +// CHECK-NEXT: call void [[TMP25]](i8* [[TMP24]], i32** [[DOTFIRSTPRIV_PTR_ADDR_I]], [2 x i32]** [[DOTFIRSTPRIV_PTR_ADDR1_I]], [2 x %struct.S.0]** [[DOTFIRSTPRIV_PTR_ADDR2_I]], %struct.S.0** [[DOTFIRSTPRIV_PTR_ADDR3_I]]) #[[ATTR4]] +// CHECK-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !28 +// CHECK-NEXT: [[TMP27:%.*]] = load [2 x i32]*, [2 x i32]** [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias !28 +// CHECK-NEXT: [[TMP28:%.*]] = load [2 x %struct.S.0]*, [2 x %struct.S.0]** [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 8, !noalias !28 +// CHECK-NEXT: [[TMP29:%.*]] = load %struct.S.0*, %struct.S.0** [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 8, !noalias !28 +// CHECK-NEXT: [[TMP30:%.*]] = load i64, i64* [[DOTLB__ADDR_I]], align 8, !noalias !28 +// CHECK-NEXT: [[CONV_I:%.*]] = trunc i64 [[TMP30]] to i32 +// CHECK-NEXT: store i32 [[CONV_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !28 +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND_I:%.*]] +// CHECK: omp.inner.for.cond.i: +// CHECK-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !28 +// CHECK-NEXT: [[CONV4_I:%.*]] = sext i32 [[TMP31]] to i64 +// CHECK-NEXT: [[TMP32:%.*]] = load i64, i64* [[DOTUB__ADDR_I]], align 8, !noalias !28 +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule i64 [[CONV4_I]], [[TMP32]] +// CHECK-NEXT: br i1 [[CMP_I]], label [[OMP_INNER_FOR_BODY_I:%.*]], label [[DOTOMP_OUTLINED__3_EXIT:%.*]] +// CHECK: omp.inner.for.body.i: +// CHECK-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !28 +// CHECK-NEXT: store i32 [[TMP33]], i32* [[I_I]], align 4, !noalias !28 +// CHECK-NEXT: [[TMP34:%.*]] = load i32, i32* [[TMP26]], align 128 +// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[TMP27]], i64 0, i64 0 +// CHECK-NEXT: store i32 [[TMP34]], i32* [[ARRAYIDX_I]], align 4 +// CHECK-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[TMP28]], i64 0, i64 0 +// CHECK-NEXT: [[TMP35:%.*]] = bitcast %struct.S.0* [[ARRAYIDX5_I]] to i8* +// CHECK-NEXT: [[TMP36:%.*]] = bitcast %struct.S.0* [[TMP29]] to i8* +// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP35]], i8* align 4 [[TMP36]], i64 4, i1 false) +// CHECK-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !28 +// CHECK-NEXT: [[ADD6_I:%.*]] = add nsw i32 [[TMP37]], 1 +// CHECK-NEXT: store i32 [[ADD6_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !28 +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND_I]] +// CHECK: .omp_outlined..3.exit: +// CHECK-NEXT: ret i32 0 +// +// +// CHECK-LABEL: define {{[^@]+}}@.omp_task_dup..6 +// CHECK-SAME: (%struct.kmp_task_t_with_privates.2* noundef [[TMP0:%.*]], %struct.kmp_task_t_with_privates.2* noundef [[TMP1:%.*]], i32 noundef [[TMP2:%.*]]) #[[ATTR7]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca %struct.kmp_task_t_with_privates.2*, align 8 +// CHECK-NEXT: 
[[DOTADDR1:%.*]] = alloca %struct.kmp_task_t_with_privates.2*, align 8 +// CHECK-NEXT: [[DOTADDR2:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store %struct.kmp_task_t_with_privates.2* [[TMP0]], %struct.kmp_task_t_with_privates.2** [[DOTADDR]], align 8 +// CHECK-NEXT: store %struct.kmp_task_t_with_privates.2* [[TMP1]], %struct.kmp_task_t_with_privates.2** [[DOTADDR1]], align 8 +// CHECK-NEXT: store i32 [[TMP2]], i32* [[DOTADDR2]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load %struct.kmp_task_t_with_privates.2*, %struct.kmp_task_t_with_privates.2** [[DOTADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load %struct.kmp_task_t_with_privates.2*, %struct.kmp_task_t_with_privates.2** [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_2:%.*]], %struct.kmp_task_t_with_privates.2* [[TMP4]], i32 0, i32 0 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP5]], i32 0, i32 0 +// CHECK-NEXT: [[TMP7:%.*]] = load i8*, i8** [[TMP6]], align 128 +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_2]], %struct.kmp_task_t_with_privates.2* [[TMP3]], i32 0, i32 2 +// CHECK-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP7]] to %struct.anon.1* +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3:%.*]], %struct..kmp_privates.t.3* [[TMP8]], i32 0, i32 2 +// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_ANON_1:%.*]], %struct.anon.1* [[TMP9]], i32 0, i32 0 +// CHECK-NEXT: [[TMP12:%.*]] = load [2 x %struct.S.0]*, [2 x %struct.S.0]** [[TMP11]], align 8 +// CHECK-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[TMP10]], i32 0, i32 0 +// CHECK-NEXT: [[TMP13:%.*]] = bitcast [2 x %struct.S.0]* [[TMP12]] to %struct.S.0* +// CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], %struct.S.0* [[ARRAY_BEGIN]], i64 2 +// CHECK-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq %struct.S.0* [[ARRAY_BEGIN]], [[TMP14]] +// CHECK-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE3:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]] +// CHECK: omp.arraycpy.body: +// CHECK-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi %struct.S.0* [ [[TMP13]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] +// CHECK-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi %struct.S.0* [ [[ARRAY_BEGIN]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] +// CHECK-NEXT: call void @_ZN1SIiEC1ERKS0_i(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[OMP_ARRAYCPY_DESTELEMENTPAST]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 noundef 0) +// CHECK-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr [[STRUCT_S_0]], %struct.S.0* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1 +// CHECK-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr [[STRUCT_S_0]], %struct.S.0* [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1 +// CHECK-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq %struct.S.0* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP14]] +// CHECK-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE3]], label [[OMP_ARRAYCPY_BODY]] +// CHECK: omp.arraycpy.done3: +// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3]], %struct..kmp_privates.t.3* [[TMP8]], i32 0, i32 3 +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_ANON_1]], %struct.anon.1* [[TMP9]], i32 0, i32 1 +// CHECK-NEXT: [[TMP17:%.*]] = load %struct.S.0*, 
%struct.S.0** [[TMP16]], align 8 +// CHECK-NEXT: call void @_ZN1SIiEC1ERKS0_i(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TMP15]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TMP17]], i32 noundef 0) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@.omp_task_destructor..7 +// CHECK-SAME: (i32 noundef [[TMP0:%.*]], %struct.kmp_task_t_with_privates.2* noalias noundef [[TMP1:%.*]]) #[[ATTR7]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca %struct.kmp_task_t_with_privates.2*, align 8 +// CHECK-NEXT: store i32 [[TMP0]], i32* [[DOTADDR]], align 4 +// CHECK-NEXT: store %struct.kmp_task_t_with_privates.2* [[TMP1]], %struct.kmp_task_t_with_privates.2** [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load %struct.kmp_task_t_with_privates.2*, %struct.kmp_task_t_with_privates.2** [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_2:%.*]], %struct.kmp_task_t_with_privates.2* [[TMP2]], i32 0, i32 2 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3:%.*]], %struct..kmp_privates.t.3* [[TMP3]], i32 0, i32 2 +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3]], %struct..kmp_privates.t.3* [[TMP3]], i32 0, i32 3 +// CHECK-NEXT: call void @_ZN1SIiED1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TMP5]]) #[[ATTR4]] +// CHECK-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[TMP4]], i32 0, i32 0 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], %struct.S.0* [[ARRAY_BEGIN]], i64 2 +// CHECK-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] +// CHECK: arraydestroy.body: +// CHECK-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi %struct.S.0* [ [[TMP6]], [[ENTRY:%.*]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] +// CHECK-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], %struct.S.0* [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 +// CHECK-NEXT: call void @_ZN1SIiED1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq %struct.S.0* [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] +// CHECK-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE2:%.*]], label [[ARRAYDESTROY_BODY]] +// CHECK: arraydestroy.done2: +// CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[RETVAL]], align 4 +// CHECK-NEXT: ret i32 [[TMP7]] +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIiED1Ev +// CHECK-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// CHECK-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: call void @_ZN1SIiED2Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIiEC2Ev +// CHECK-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// CHECK-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[THIS1:%.*]] = 
load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], %struct.S.0* [[THIS1]], i32 0, i32 0 +// CHECK-NEXT: store i32 0, i32* [[F]], align 4 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIiEC2ERKS0_i +// CHECK-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[S:%.*]], i32 noundef [[T:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// CHECK-NEXT: [[S_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// CHECK-NEXT: [[T_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: store %struct.S.0* [[S]], %struct.S.0** [[S_ADDR]], align 8 +// CHECK-NEXT: store i32 [[T]], i32* [[T_ADDR]], align 4 +// CHECK-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], %struct.S.0* [[THIS1]], i32 0, i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = load %struct.S.0*, %struct.S.0** [[S_ADDR]], align 8 +// CHECK-NEXT: [[F2:%.*]] = getelementptr inbounds [[STRUCT_S_0]], %struct.S.0* [[TMP0]], i32 0, i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[F2]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[T_ADDR]], align 4 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP2]] +// CHECK-NEXT: store i32 [[ADD]], i32* [[F]], align 4 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIiEC2Ei +// CHECK-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], %struct.S.0* [[THIS1]], i32 0, i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], i32* [[F]], align 4 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIiED2Ev +// CHECK-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// CHECK-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: ret void +// +// +// LAMBDA-LABEL: define {{[^@]+}}@main +// LAMBDA-SAME: () #[[ATTR0:[0-9]+]] { +// LAMBDA-NEXT: entry: +// LAMBDA-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// LAMBDA-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_ANON:%.*]], align 1 +// LAMBDA-NEXT: store i32 0, i32* [[RETVAL]], align 4 +// LAMBDA-NEXT: call void @"_ZZ4mainENK3$_0clEv"(%class.anon* noundef nonnull align 1 dereferenceable(1) [[REF_TMP]]) +// LAMBDA-NEXT: ret i32 0 +// +// +// LAMBDA-LABEL: define {{[^@]+}}@.omp_outlined. 
+// LAMBDA-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2:[0-9]+]] { +// LAMBDA-NEXT: entry: +// LAMBDA-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// LAMBDA-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// LAMBDA-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 1 +// LAMBDA-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// LAMBDA-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// LAMBDA-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// LAMBDA-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// LAMBDA-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// LAMBDA-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP1]]) +// LAMBDA-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 +// LAMBDA-NEXT: br i1 [[TMP3]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_END:%.*]] +// LAMBDA: omp_if.then: +// LAMBDA-NEXT: call void @__kmpc_taskgroup(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]]) +// LAMBDA-NEXT: [[TMP4:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i64 96, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @.omp_task_entry. to i32 (i32, i8*)*)) +// LAMBDA-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct.kmp_task_t_with_privates* +// LAMBDA-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], %struct.kmp_task_t_with_privates* [[TMP5]], i32 0, i32 0 +// LAMBDA-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], %struct.kmp_task_t_with_privates* [[TMP5]], i32 0, i32 1 +// LAMBDA-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP7]], i32 0, i32 0 +// LAMBDA-NEXT: [[TMP9:%.*]] = load volatile double, double* @g, align 8 +// LAMBDA-NEXT: store volatile double [[TMP9]], double* [[TMP8]], align 8 +// LAMBDA-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP7]], i32 0, i32 1 +// LAMBDA-NEXT: [[TMP11:%.*]] = load i32, i32* @_ZZ4mainE5sivar, align 4 +// LAMBDA-NEXT: store i32 [[TMP11]], i32* [[TMP10]], align 8 +// LAMBDA-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP6]], i32 0, i32 5 +// LAMBDA-NEXT: store i64 0, i64* [[TMP12]], align 8 +// LAMBDA-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP6]], i32 0, i32 6 +// LAMBDA-NEXT: store i64 9, i64* [[TMP13]], align 8 +// LAMBDA-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP6]], i32 0, i32 7 +// LAMBDA-NEXT: store i64 1, i64* [[TMP14]], align 8 +// LAMBDA-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP6]], i32 0, i32 9 +// LAMBDA-NEXT: [[TMP16:%.*]] = bitcast i8** [[TMP15]] to i8* +// LAMBDA-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP16]], i8 0, i64 8, i1 false) +// LAMBDA-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP14]], align 8 +// LAMBDA-NEXT: call void @__kmpc_taskloop(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i8* [[TMP4]], i32 1, i64* [[TMP12]], i64* [[TMP13]], i64 [[TMP17]], i32 1, i32 0, i64 0, i8* null) +// LAMBDA-NEXT: call void @__kmpc_end_taskgroup(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]]) +// LAMBDA-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]]) +// LAMBDA-NEXT: br label 
[[OMP_IF_END]] +// LAMBDA: omp_if.end: +// LAMBDA-NEXT: ret void +// +// +// LAMBDA-LABEL: define {{[^@]+}}@.omp_task_privates_map. +// LAMBDA-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], double** noalias noundef [[TMP1:%.*]], i32** noalias noundef [[TMP2:%.*]]) #[[ATTR5:[0-9]+]] { +// LAMBDA-NEXT: entry: +// LAMBDA-NEXT: [[DOTADDR:%.*]] = alloca %struct..kmp_privates.t*, align 8 +// LAMBDA-NEXT: [[DOTADDR1:%.*]] = alloca double**, align 8 +// LAMBDA-NEXT: [[DOTADDR2:%.*]] = alloca i32**, align 8 +// LAMBDA-NEXT: store %struct..kmp_privates.t* [[TMP0]], %struct..kmp_privates.t** [[DOTADDR]], align 8 +// LAMBDA-NEXT: store double** [[TMP1]], double*** [[DOTADDR1]], align 8 +// LAMBDA-NEXT: store i32** [[TMP2]], i32*** [[DOTADDR2]], align 8 +// LAMBDA-NEXT: [[TMP3:%.*]] = load %struct..kmp_privates.t*, %struct..kmp_privates.t** [[DOTADDR]], align 8 +// LAMBDA-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP3]], i32 0, i32 0 +// LAMBDA-NEXT: [[TMP5:%.*]] = load double**, double*** [[DOTADDR1]], align 8 +// LAMBDA-NEXT: store double* [[TMP4]], double** [[TMP5]], align 8 +// LAMBDA-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP3]], i32 0, i32 1 +// LAMBDA-NEXT: [[TMP7:%.*]] = load i32**, i32*** [[DOTADDR2]], align 8 +// LAMBDA-NEXT: store i32* [[TMP6]], i32** [[TMP7]], align 8 +// LAMBDA-NEXT: ret void +// +// +// LAMBDA-LABEL: define {{[^@]+}}@.omp_task_entry. +// LAMBDA-SAME: (i32 noundef [[TMP0:%.*]], %struct.kmp_task_t_with_privates* noalias noundef [[TMP1:%.*]]) #[[ATTR6:[0-9]+]] { +// LAMBDA-NEXT: entry: +// LAMBDA-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4 +// LAMBDA-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca i32*, align 8 +// LAMBDA-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca i8*, align 8 +// LAMBDA-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca void (i8*, ...)*, align 8 +// LAMBDA-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca i8*, align 8 +// LAMBDA-NEXT: [[DOTLB__ADDR_I:%.*]] = alloca i64, align 8 +// LAMBDA-NEXT: [[DOTUB__ADDR_I:%.*]] = alloca i64, align 8 +// LAMBDA-NEXT: [[DOTST__ADDR_I:%.*]] = alloca i64, align 8 +// LAMBDA-NEXT: [[DOTLITER__ADDR_I:%.*]] = alloca i32, align 4 +// LAMBDA-NEXT: [[DOTREDUCTIONS__ADDR_I:%.*]] = alloca i8*, align 8 +// LAMBDA-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca %struct.anon*, align 8 +// LAMBDA-NEXT: [[DOTFIRSTPRIV_PTR_ADDR_I:%.*]] = alloca double*, align 8 +// LAMBDA-NEXT: [[DOTFIRSTPRIV_PTR_ADDR1_I:%.*]] = alloca i32*, align 8 +// LAMBDA-NEXT: [[I_I:%.*]] = alloca i32, align 4 +// LAMBDA-NEXT: [[DOTOMP_IV_I:%.*]] = alloca i32, align 4 +// LAMBDA-NEXT: [[REF_TMP_I:%.*]] = alloca [[CLASS_ANON_0:%.*]], align 8 +// LAMBDA-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 +// LAMBDA-NEXT: [[DOTADDR1:%.*]] = alloca %struct.kmp_task_t_with_privates*, align 8 +// LAMBDA-NEXT: store i32 [[TMP0]], i32* [[DOTADDR]], align 4 +// LAMBDA-NEXT: store %struct.kmp_task_t_with_privates* [[TMP1]], %struct.kmp_task_t_with_privates** [[DOTADDR1]], align 8 +// LAMBDA-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTADDR]], align 4 +// LAMBDA-NEXT: [[TMP3:%.*]] = load %struct.kmp_task_t_with_privates*, %struct.kmp_task_t_with_privates** [[DOTADDR1]], align 8 +// LAMBDA-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], %struct.kmp_task_t_with_privates* [[TMP3]], i32 0, i32 0 +// LAMBDA-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 2 +// LAMBDA-NEXT: 
[[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 0 +// LAMBDA-NEXT: [[TMP7:%.*]] = load i8*, i8** [[TMP6]], align 8 +// LAMBDA-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct.anon* +// LAMBDA-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], %struct.kmp_task_t_with_privates* [[TMP3]], i32 0, i32 1 +// LAMBDA-NEXT: [[TMP10:%.*]] = bitcast %struct..kmp_privates.t* [[TMP9]] to i8* +// LAMBDA-NEXT: [[TMP11:%.*]] = bitcast %struct.kmp_task_t_with_privates* [[TMP3]] to i8* +// LAMBDA-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 5 +// LAMBDA-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8 +// LAMBDA-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 6 +// LAMBDA-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8 +// LAMBDA-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 7 +// LAMBDA-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8 +// LAMBDA-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 8 +// LAMBDA-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 8 +// LAMBDA-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 9 +// LAMBDA-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 8 +// LAMBDA-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]]) +// LAMBDA-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]]) +// LAMBDA-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META8:![0-9]+]]) +// LAMBDA-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]]) +// LAMBDA-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]]) +// LAMBDA-NEXT: store i32 [[TMP2]], i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14 +// LAMBDA-NEXT: store i32* [[TMP5]], i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !14 +// LAMBDA-NEXT: store i8* [[TMP10]], i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !14 +// LAMBDA-NEXT: store void (i8*, ...)* bitcast (void (%struct..kmp_privates.t*, double**, i32**)* @.omp_task_privates_map. 
to void (i8*, ...)*), void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !14 +// LAMBDA-NEXT: store i8* [[TMP11]], i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !14 +// LAMBDA-NEXT: store i64 [[TMP13]], i64* [[DOTLB__ADDR_I]], align 8, !noalias !14 +// LAMBDA-NEXT: store i64 [[TMP15]], i64* [[DOTUB__ADDR_I]], align 8, !noalias !14 +// LAMBDA-NEXT: store i64 [[TMP17]], i64* [[DOTST__ADDR_I]], align 8, !noalias !14 +// LAMBDA-NEXT: store i32 [[TMP19]], i32* [[DOTLITER__ADDR_I]], align 4, !noalias !14 +// LAMBDA-NEXT: store i8* [[TMP21]], i8** [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias !14 +// LAMBDA-NEXT: store %struct.anon* [[TMP8]], %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !14 +// LAMBDA-NEXT: [[TMP22:%.*]] = load %struct.anon*, %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !14 +// LAMBDA-NEXT: [[TMP23:%.*]] = load void (i8*, ...)*, void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !14 +// LAMBDA-NEXT: [[TMP24:%.*]] = load i8*, i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !14 +// LAMBDA-NEXT: [[TMP25:%.*]] = bitcast void (i8*, ...)* [[TMP23]] to void (i8*, double**, i32**)* +// LAMBDA-NEXT: call void [[TMP25]](i8* [[TMP24]], double** [[DOTFIRSTPRIV_PTR_ADDR_I]], i32** [[DOTFIRSTPRIV_PTR_ADDR1_I]]) #[[ATTR3:[0-9]+]] +// LAMBDA-NEXT: [[TMP26:%.*]] = load double*, double** [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !14 +// LAMBDA-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias !14 +// LAMBDA-NEXT: [[TMP28:%.*]] = load i64, i64* [[DOTLB__ADDR_I]], align 8, !noalias !14 +// LAMBDA-NEXT: [[CONV_I:%.*]] = trunc i64 [[TMP28]] to i32 +// LAMBDA-NEXT: store i32 [[CONV_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !14 +// LAMBDA-NEXT: br label [[OMP_INNER_FOR_COND_I:%.*]] +// LAMBDA: omp.inner.for.cond.i: +// LAMBDA-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14 +// LAMBDA-NEXT: [[CONV2_I:%.*]] = sext i32 [[TMP29]] to i64 +// LAMBDA-NEXT: [[TMP30:%.*]] = load i64, i64* [[DOTUB__ADDR_I]], align 8, !noalias !14 +// LAMBDA-NEXT: [[CMP_I:%.*]] = icmp ule i64 [[CONV2_I]], [[TMP30]] +// LAMBDA-NEXT: br i1 [[CMP_I]], label [[OMP_INNER_FOR_BODY_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]] +// LAMBDA: omp.inner.for.body.i: +// LAMBDA-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14 +// LAMBDA-NEXT: store i32 [[TMP31]], i32* [[I_I]], align 4, !noalias !14 +// LAMBDA-NEXT: store double 1.000000e+00, double* [[TMP26]], align 8 +// LAMBDA-NEXT: store i32 11, i32* [[TMP27]], align 4 +// LAMBDA-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], %class.anon.0* [[REF_TMP_I]], i32 0, i32 0 +// LAMBDA-NEXT: store double* [[TMP26]], double** [[TMP32]], align 8, !noalias !14 +// LAMBDA-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], %class.anon.0* [[REF_TMP_I]], i32 0, i32 1 +// LAMBDA-NEXT: store i32* [[TMP27]], i32** [[TMP33]], align 8, !noalias !14 +// LAMBDA-NEXT: call void @"_ZZZ4mainENK3$_0clEvENKUlvE_clEv"(%class.anon.0* noundef nonnull align 8 dereferenceable(16) [[REF_TMP_I]]) +// LAMBDA-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14 +// LAMBDA-NEXT: [[ADD3_I:%.*]] = add nsw i32 [[TMP34]], 1 +// LAMBDA-NEXT: store i32 [[ADD3_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !14 +// LAMBDA-NEXT: br label [[OMP_INNER_FOR_COND_I]] +// LAMBDA: .omp_outlined..1.exit: +// LAMBDA-NEXT: ret i32 0 +// +// +// BLOCKS-LABEL: define {{[^@]+}}@main +// BLOCKS-SAME: () #[[ATTR1:[0-9]+]] { +// BLOCKS-NEXT: entry: +// BLOCKS-NEXT: 
[[RETVAL:%.*]] = alloca i32, align 4 +// BLOCKS-NEXT: store i32 0, i32* [[RETVAL]], align 4 +// BLOCKS-NEXT: [[TMP0:%.*]] = load i8*, i8** getelementptr inbounds ([[STRUCT___BLOCK_LITERAL_GENERIC:%.*]], %struct.__block_literal_generic* bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor* }* @__block_literal_global to %struct.__block_literal_generic*), i32 0, i32 3), align 8 +// BLOCKS-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to void (i8*)* +// BLOCKS-NEXT: call void [[TMP1]](i8* noundef bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor* }* @__block_literal_global to i8*)) +// BLOCKS-NEXT: ret i32 0 +// +// +// BLOCKS-LABEL: define {{[^@]+}}@__main_block_invoke +// BLOCKS-SAME: (i8* noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR2:[0-9]+]] { +// BLOCKS-NEXT: entry: +// BLOCKS-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca i8*, align 8 +// BLOCKS-NEXT: [[BLOCK_ADDR:%.*]] = alloca <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>*, align 8 +// BLOCKS-NEXT: store i8* [[DOTBLOCK_DESCRIPTOR]], i8** [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8 +// BLOCKS-NEXT: [[BLOCK:%.*]] = bitcast i8* [[DOTBLOCK_DESCRIPTOR]] to <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>* +// BLOCKS-NEXT: store <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>* [[BLOCK]], <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>** [[BLOCK_ADDR]], align 8 +// BLOCKS-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*)) +// BLOCKS-NEXT: ret void +// +// +// BLOCKS-LABEL: define {{[^@]+}}@.omp_outlined. +// BLOCKS-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3:[0-9]+]] { +// BLOCKS-NEXT: entry: +// BLOCKS-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// BLOCKS-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// BLOCKS-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 1 +// BLOCKS-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// BLOCKS-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// BLOCKS-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// BLOCKS-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// BLOCKS-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// BLOCKS-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]]) +// BLOCKS-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 +// BLOCKS-NEXT: br i1 [[TMP3]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_END:%.*]] +// BLOCKS: omp_if.then: +// BLOCKS-NEXT: call void @__kmpc_taskgroup(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]]) +// BLOCKS-NEXT: [[TMP4:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i64 96, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @.omp_task_entry. 
to i32 (i32, i8*)*)) +// BLOCKS-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct.kmp_task_t_with_privates* +// BLOCKS-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], %struct.kmp_task_t_with_privates* [[TMP5]], i32 0, i32 0 +// BLOCKS-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], %struct.kmp_task_t_with_privates* [[TMP5]], i32 0, i32 1 +// BLOCKS-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP7]], i32 0, i32 0 +// BLOCKS-NEXT: [[TMP9:%.*]] = load volatile double, double* @g, align 8 +// BLOCKS-NEXT: store volatile double [[TMP9]], double* [[TMP8]], align 8 +// BLOCKS-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP7]], i32 0, i32 1 +// BLOCKS-NEXT: [[TMP11:%.*]] = load i32, i32* @_ZZ4mainE5sivar, align 4 +// BLOCKS-NEXT: store i32 [[TMP11]], i32* [[TMP10]], align 8 +// BLOCKS-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP6]], i32 0, i32 5 +// BLOCKS-NEXT: store i64 0, i64* [[TMP12]], align 8 +// BLOCKS-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP6]], i32 0, i32 6 +// BLOCKS-NEXT: store i64 9, i64* [[TMP13]], align 8 +// BLOCKS-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP6]], i32 0, i32 7 +// BLOCKS-NEXT: store i64 1, i64* [[TMP14]], align 8 +// BLOCKS-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP6]], i32 0, i32 9 +// BLOCKS-NEXT: [[TMP16:%.*]] = bitcast i8** [[TMP15]] to i8* +// BLOCKS-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP16]], i8 0, i64 8, i1 false) +// BLOCKS-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP14]], align 8 +// BLOCKS-NEXT: call void @__kmpc_taskloop(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i8* [[TMP4]], i32 1, i64* [[TMP12]], i64* [[TMP13]], i64 [[TMP17]], i32 1, i32 0, i64 0, i8* null) +// BLOCKS-NEXT: call void @__kmpc_end_taskgroup(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]]) +// BLOCKS-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]]) +// BLOCKS-NEXT: br label [[OMP_IF_END]] +// BLOCKS: omp_if.end: +// BLOCKS-NEXT: ret void +// +// +// BLOCKS-LABEL: define {{[^@]+}}@_block_invoke +// BLOCKS-SAME: (i8* noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR2]] { +// BLOCKS-NEXT: entry: +// BLOCKS-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca i8*, align 8 +// BLOCKS-NEXT: [[BLOCK_ADDR:%.*]] = alloca <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>*, align 8 +// BLOCKS-NEXT: store i8* [[DOTBLOCK_DESCRIPTOR]], i8** [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8 +// BLOCKS-NEXT: [[BLOCK:%.*]] = bitcast i8* [[DOTBLOCK_DESCRIPTOR]] to <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* +// BLOCKS-NEXT: store <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK]], <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>** [[BLOCK_ADDR]], align 8 +// BLOCKS-NEXT: [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK]], i32 0, i32 5 +// BLOCKS-NEXT: store double 2.000000e+00, double* [[BLOCK_CAPTURE_ADDR]], align 8 +// BLOCKS-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, 
%struct.__block_descriptor*, double, i32 }>* [[BLOCK]], i32 0, i32 6 +// BLOCKS-NEXT: store i32 22, i32* [[BLOCK_CAPTURE_ADDR1]], align 8 +// BLOCKS-NEXT: ret void +// +// +// BLOCKS-LABEL: define {{[^@]+}}@.omp_task_privates_map. +// BLOCKS-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], double** noalias noundef [[TMP1:%.*]], i32** noalias noundef [[TMP2:%.*]]) #[[ATTR6:[0-9]+]] { +// BLOCKS-NEXT: entry: +// BLOCKS-NEXT: [[DOTADDR:%.*]] = alloca %struct..kmp_privates.t*, align 8 +// BLOCKS-NEXT: [[DOTADDR1:%.*]] = alloca double**, align 8 +// BLOCKS-NEXT: [[DOTADDR2:%.*]] = alloca i32**, align 8 +// BLOCKS-NEXT: store %struct..kmp_privates.t* [[TMP0]], %struct..kmp_privates.t** [[DOTADDR]], align 8 +// BLOCKS-NEXT: store double** [[TMP1]], double*** [[DOTADDR1]], align 8 +// BLOCKS-NEXT: store i32** [[TMP2]], i32*** [[DOTADDR2]], align 8 +// BLOCKS-NEXT: [[TMP3:%.*]] = load %struct..kmp_privates.t*, %struct..kmp_privates.t** [[DOTADDR]], align 8 +// BLOCKS-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP3]], i32 0, i32 0 +// BLOCKS-NEXT: [[TMP5:%.*]] = load double**, double*** [[DOTADDR1]], align 8 +// BLOCKS-NEXT: store double* [[TMP4]], double** [[TMP5]], align 8 +// BLOCKS-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP3]], i32 0, i32 1 +// BLOCKS-NEXT: [[TMP7:%.*]] = load i32**, i32*** [[DOTADDR2]], align 8 +// BLOCKS-NEXT: store i32* [[TMP6]], i32** [[TMP7]], align 8 +// BLOCKS-NEXT: ret void +// +// +// BLOCKS-LABEL: define {{[^@]+}}@.omp_task_entry. +// BLOCKS-SAME: (i32 noundef [[TMP0:%.*]], %struct.kmp_task_t_with_privates* noalias noundef [[TMP1:%.*]]) #[[ATTR7:[0-9]+]] { +// BLOCKS-NEXT: entry: +// BLOCKS-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4 +// BLOCKS-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca i32*, align 8 +// BLOCKS-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca i8*, align 8 +// BLOCKS-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca void (i8*, ...)*, align 8 +// BLOCKS-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca i8*, align 8 +// BLOCKS-NEXT: [[DOTLB__ADDR_I:%.*]] = alloca i64, align 8 +// BLOCKS-NEXT: [[DOTUB__ADDR_I:%.*]] = alloca i64, align 8 +// BLOCKS-NEXT: [[DOTST__ADDR_I:%.*]] = alloca i64, align 8 +// BLOCKS-NEXT: [[DOTLITER__ADDR_I:%.*]] = alloca i32, align 4 +// BLOCKS-NEXT: [[DOTREDUCTIONS__ADDR_I:%.*]] = alloca i8*, align 8 +// BLOCKS-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca %struct.anon*, align 8 +// BLOCKS-NEXT: [[DOTFIRSTPRIV_PTR_ADDR_I:%.*]] = alloca double*, align 8 +// BLOCKS-NEXT: [[DOTFIRSTPRIV_PTR_ADDR1_I:%.*]] = alloca i32*, align 8 +// BLOCKS-NEXT: [[I_I:%.*]] = alloca i32, align 4 +// BLOCKS-NEXT: [[DOTOMP_IV_I:%.*]] = alloca i32, align 4 +// BLOCKS-NEXT: [[BLOCK_I:%.*]] = alloca <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, align 8 +// BLOCKS-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 +// BLOCKS-NEXT: [[DOTADDR1:%.*]] = alloca %struct.kmp_task_t_with_privates*, align 8 +// BLOCKS-NEXT: store i32 [[TMP0]], i32* [[DOTADDR]], align 4 +// BLOCKS-NEXT: store %struct.kmp_task_t_with_privates* [[TMP1]], %struct.kmp_task_t_with_privates** [[DOTADDR1]], align 8 +// BLOCKS-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTADDR]], align 4 +// BLOCKS-NEXT: [[TMP3:%.*]] = load %struct.kmp_task_t_with_privates*, %struct.kmp_task_t_with_privates** [[DOTADDR1]], align 8 +// BLOCKS-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], %struct.kmp_task_t_with_privates* [[TMP3]], i32 0, i32 0 
+// BLOCKS-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 2 +// BLOCKS-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 0 +// BLOCKS-NEXT: [[TMP7:%.*]] = load i8*, i8** [[TMP6]], align 8 +// BLOCKS-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct.anon* +// BLOCKS-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], %struct.kmp_task_t_with_privates* [[TMP3]], i32 0, i32 1 +// BLOCKS-NEXT: [[TMP10:%.*]] = bitcast %struct..kmp_privates.t* [[TMP9]] to i8* +// BLOCKS-NEXT: [[TMP11:%.*]] = bitcast %struct.kmp_task_t_with_privates* [[TMP3]] to i8* +// BLOCKS-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 5 +// BLOCKS-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8 +// BLOCKS-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 6 +// BLOCKS-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8 +// BLOCKS-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 7 +// BLOCKS-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8 +// BLOCKS-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 8 +// BLOCKS-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 8 +// BLOCKS-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 9 +// BLOCKS-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 8 +// BLOCKS-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]]) +// BLOCKS-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]]) +// BLOCKS-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META8:![0-9]+]]) +// BLOCKS-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]]) +// BLOCKS-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]]) +// BLOCKS-NEXT: store i32 [[TMP2]], i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14 +// BLOCKS-NEXT: store i32* [[TMP5]], i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !14 +// BLOCKS-NEXT: store i8* [[TMP10]], i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !14 +// BLOCKS-NEXT: store void (i8*, ...)* bitcast (void (%struct..kmp_privates.t*, double**, i32**)* @.omp_task_privates_map. 
to void (i8*, ...)*), void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !14 +// BLOCKS-NEXT: store i8* [[TMP11]], i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !14 +// BLOCKS-NEXT: store i64 [[TMP13]], i64* [[DOTLB__ADDR_I]], align 8, !noalias !14 +// BLOCKS-NEXT: store i64 [[TMP15]], i64* [[DOTUB__ADDR_I]], align 8, !noalias !14 +// BLOCKS-NEXT: store i64 [[TMP17]], i64* [[DOTST__ADDR_I]], align 8, !noalias !14 +// BLOCKS-NEXT: store i32 [[TMP19]], i32* [[DOTLITER__ADDR_I]], align 4, !noalias !14 +// BLOCKS-NEXT: store i8* [[TMP21]], i8** [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias !14 +// BLOCKS-NEXT: store %struct.anon* [[TMP8]], %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !14 +// BLOCKS-NEXT: [[TMP22:%.*]] = load %struct.anon*, %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !14 +// BLOCKS-NEXT: [[TMP23:%.*]] = load void (i8*, ...)*, void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !14 +// BLOCKS-NEXT: [[TMP24:%.*]] = load i8*, i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !14 +// BLOCKS-NEXT: [[TMP25:%.*]] = bitcast void (i8*, ...)* [[TMP23]] to void (i8*, double**, i32**)* +// BLOCKS-NEXT: call void [[TMP25]](i8* [[TMP24]], double** [[DOTFIRSTPRIV_PTR_ADDR_I]], i32** [[DOTFIRSTPRIV_PTR_ADDR1_I]]) #[[ATTR4:[0-9]+]] +// BLOCKS-NEXT: [[TMP26:%.*]] = load double*, double** [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !14 +// BLOCKS-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias !14 +// BLOCKS-NEXT: [[TMP28:%.*]] = load i64, i64* [[DOTLB__ADDR_I]], align 8, !noalias !14 +// BLOCKS-NEXT: [[CONV_I:%.*]] = trunc i64 [[TMP28]] to i32 +// BLOCKS-NEXT: store i32 [[CONV_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !14 +// BLOCKS-NEXT: br label [[OMP_INNER_FOR_COND_I:%.*]] +// BLOCKS: omp.inner.for.cond.i: +// BLOCKS-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14 +// BLOCKS-NEXT: [[CONV2_I:%.*]] = sext i32 [[TMP29]] to i64 +// BLOCKS-NEXT: [[TMP30:%.*]] = load i64, i64* [[DOTUB__ADDR_I]], align 8, !noalias !14 +// BLOCKS-NEXT: [[CMP_I:%.*]] = icmp ule i64 [[CONV2_I]], [[TMP30]] +// BLOCKS-NEXT: br i1 [[CMP_I]], label [[OMP_INNER_FOR_BODY_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]] +// BLOCKS: omp.inner.for.body.i: +// BLOCKS-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14 +// BLOCKS-NEXT: store i32 [[TMP31]], i32* [[I_I]], align 4, !noalias !14 +// BLOCKS-NEXT: store double 1.000000e+00, double* [[TMP26]], align 8 +// BLOCKS-NEXT: store i32 11, i32* [[TMP27]], align 4 +// BLOCKS-NEXT: [[BLOCK_ISA_I:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK_I]], i32 0, i32 0 +// BLOCKS-NEXT: store i8* bitcast (i8** @_NSConcreteStackBlock to i8*), i8** [[BLOCK_ISA_I]], align 8, !noalias !14 +// BLOCKS-NEXT: [[BLOCK_FLAGS_I:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK_I]], i32 0, i32 1 +// BLOCKS-NEXT: store i32 1073741824, i32* [[BLOCK_FLAGS_I]], align 8, !noalias !14 +// BLOCKS-NEXT: [[BLOCK_RESERVED_I:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK_I]], i32 0, i32 2 +// BLOCKS-NEXT: store i32 0, i32* [[BLOCK_RESERVED_I]], align 4, !noalias !14 +// BLOCKS-NEXT: [[BLOCK_INVOKE_I:%.*]] = 
getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK_I]], i32 0, i32 3 +// BLOCKS-NEXT: store i8* bitcast (void (i8*)* @_block_invoke to i8*), i8** [[BLOCK_INVOKE_I]], align 8, !noalias !14 +// BLOCKS-NEXT: [[BLOCK_DESCRIPTOR_I:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK_I]], i32 0, i32 4 +// BLOCKS-NEXT: store %struct.__block_descriptor* bitcast ({ i64, i64, i8*, i8* }* @__block_descriptor_tmp.2 to %struct.__block_descriptor*), %struct.__block_descriptor** [[BLOCK_DESCRIPTOR_I]], align 8, !noalias !14 +// BLOCKS-NEXT: [[BLOCK_CAPTURED_I:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK_I]], i32 0, i32 5 +// BLOCKS-NEXT: [[TMP32:%.*]] = load volatile double, double* [[TMP26]], align 8 +// BLOCKS-NEXT: store volatile double [[TMP32]], double* [[BLOCK_CAPTURED_I]], align 8, !noalias !14 +// BLOCKS-NEXT: [[BLOCK_CAPTURED3_I:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK_I]], i32 0, i32 6 +// BLOCKS-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP27]], align 4 +// BLOCKS-NEXT: store i32 [[TMP33]], i32* [[BLOCK_CAPTURED3_I]], align 8, !noalias !14 +// BLOCKS-NEXT: [[TMP34:%.*]] = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK_I]] to void ()* +// BLOCKS-NEXT: [[BLOCK_LITERAL_I:%.*]] = bitcast void ()* [[TMP34]] to %struct.__block_literal_generic* +// BLOCKS-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___BLOCK_LITERAL_GENERIC:%.*]], %struct.__block_literal_generic* [[BLOCK_LITERAL_I]], i32 0, i32 3 +// BLOCKS-NEXT: [[TMP36:%.*]] = bitcast %struct.__block_literal_generic* [[BLOCK_LITERAL_I]] to i8* +// BLOCKS-NEXT: [[TMP37:%.*]] = load i8*, i8** [[TMP35]], align 8, !noalias !14 +// BLOCKS-NEXT: [[TMP38:%.*]] = bitcast i8* [[TMP37]] to void (i8*)* +// BLOCKS-NEXT: call void [[TMP38]](i8* noundef [[TMP36]]) #[[ATTR4]] +// BLOCKS-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14 +// BLOCKS-NEXT: [[ADD4_I:%.*]] = add nsw i32 [[TMP39]], 1 +// BLOCKS-NEXT: store i32 [[ADD4_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !14 +// BLOCKS-NEXT: br label [[OMP_INNER_FOR_COND_I]] +// BLOCKS: .omp_outlined..1.exit: +// BLOCKS-NEXT: ret i32 0 +// +// +// ARRAY-LABEL: define {{[^@]+}}@_Z10array_funciPfP2St +// ARRAY-SAME: (i32 noundef [[N:%.*]], float* noundef [[A:%.*]], %struct.St* noundef [[S:%.*]]) #[[ATTR0:[0-9]+]] { +// ARRAY-NEXT: entry: +// ARRAY-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// ARRAY-NEXT: [[A_ADDR:%.*]] = alloca float*, align 8 +// ARRAY-NEXT: [[S_ADDR:%.*]] = alloca %struct.St*, align 8 +// ARRAY-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// ARRAY-NEXT: store float* [[A]], float** [[A_ADDR]], align 8 +// ARRAY-NEXT: store %struct.St* [[S]], %struct.St** [[S_ADDR]], align 8 +// ARRAY-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// ARRAY-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +// ARRAY-NEXT: [[TMP2:%.*]] = load float*, float** [[A_ADDR]], align 8 +// ARRAY-NEXT: [[TMP3:%.*]] = load %struct.St*, %struct.St** [[S_ADDR]], align 8 +// ARRAY-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, float*, %struct.St*)* @.omp_outlined. to void (i32*, i32*, ...)*), i64 [[TMP1]], float* [[TMP2]], %struct.St* [[TMP3]]) +// ARRAY-NEXT: ret void +// +// +// ARRAY-LABEL: define {{[^@]+}}@.omp_outlined. +// ARRAY-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], float* noundef [[A:%.*]], %struct.St* noundef [[S:%.*]]) #[[ATTR1:[0-9]+]] { +// ARRAY-NEXT: entry: +// ARRAY-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// ARRAY-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// ARRAY-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8 +// ARRAY-NEXT: [[A_ADDR:%.*]] = alloca float*, align 8 +// ARRAY-NEXT: [[S_ADDR:%.*]] = alloca %struct.St*, align 8 +// ARRAY-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 8 +// ARRAY-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// ARRAY-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// ARRAY-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// ARRAY-NEXT: store i64 [[VLA]], i64* [[VLA_ADDR]], align 8 +// ARRAY-NEXT: store float* [[A]], float** [[A_ADDR]], align 8 +// ARRAY-NEXT: store %struct.St* [[S]], %struct.St** [[S_ADDR]], align 8 +// ARRAY-NEXT: [[TMP0:%.*]] = load i64, i64* [[VLA_ADDR]], align 8 +// ARRAY-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// ARRAY-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// ARRAY-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// ARRAY-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 +// ARRAY-NEXT: br i1 [[TMP4]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_END:%.*]] +// ARRAY: omp_if.then: +// ARRAY-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[AGG_CAPTURED]], i32 0, i32 0 +// ARRAY-NEXT: store i64 [[TMP0]], i64* [[TMP5]], align 8 +// ARRAY-NEXT: call void @__kmpc_taskgroup(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// ARRAY-NEXT: [[TMP6:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 1, i64 96, i64 8, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @.omp_task_entry. 
to i32 (i32, i8*)*)) +// ARRAY-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to %struct.kmp_task_t_with_privates* +// ARRAY-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], %struct.kmp_task_t_with_privates* [[TMP7]], i32 0, i32 0 +// ARRAY-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP8]], i32 0, i32 0 +// ARRAY-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8 +// ARRAY-NEXT: [[TMP11:%.*]] = bitcast %struct.anon* [[AGG_CAPTURED]] to i8* +// ARRAY-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP10]], i8* align 8 [[TMP11]], i64 8, i1 false) +// ARRAY-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], %struct.kmp_task_t_with_privates* [[TMP7]], i32 0, i32 1 +// ARRAY-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP12]], i32 0, i32 0 +// ARRAY-NEXT: [[TMP14:%.*]] = load float*, float** [[A_ADDR]], align 8 +// ARRAY-NEXT: store float* [[TMP14]], float** [[TMP13]], align 8 +// ARRAY-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP12]], i32 0, i32 1 +// ARRAY-NEXT: [[TMP16:%.*]] = load %struct.St*, %struct.St** [[S_ADDR]], align 8 +// ARRAY-NEXT: store %struct.St* [[TMP16]], %struct.St** [[TMP15]], align 8 +// ARRAY-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP8]], i32 0, i32 5 +// ARRAY-NEXT: store i64 0, i64* [[TMP17]], align 8 +// ARRAY-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP8]], i32 0, i32 6 +// ARRAY-NEXT: store i64 9, i64* [[TMP18]], align 8 +// ARRAY-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP8]], i32 0, i32 7 +// ARRAY-NEXT: store i64 1, i64* [[TMP19]], align 8 +// ARRAY-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP8]], i32 0, i32 9 +// ARRAY-NEXT: [[TMP21:%.*]] = bitcast i8** [[TMP20]] to i8* +// ARRAY-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP21]], i8 0, i64 8, i1 false) +// ARRAY-NEXT: [[TMP22:%.*]] = load i64, i64* [[TMP19]], align 8 +// ARRAY-NEXT: call void @__kmpc_taskloop(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i8* [[TMP6]], i32 1, i64* [[TMP17]], i64* [[TMP18]], i64 [[TMP22]], i32 1, i32 0, i64 0, i8* null) +// ARRAY-NEXT: call void @__kmpc_end_taskgroup(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// ARRAY-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// ARRAY-NEXT: br label [[OMP_IF_END]] +// ARRAY: omp_if.end: +// ARRAY-NEXT: ret void +// +// +// ARRAY-LABEL: define {{[^@]+}}@.omp_task_privates_map. 
+// ARRAY-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], float*** noalias noundef [[TMP1:%.*]], %struct.St*** noalias noundef [[TMP2:%.*]]) #[[ATTR4:[0-9]+]] { +// ARRAY-NEXT: entry: +// ARRAY-NEXT: [[DOTADDR:%.*]] = alloca %struct..kmp_privates.t*, align 8 +// ARRAY-NEXT: [[DOTADDR1:%.*]] = alloca float***, align 8 +// ARRAY-NEXT: [[DOTADDR2:%.*]] = alloca %struct.St***, align 8 +// ARRAY-NEXT: store %struct..kmp_privates.t* [[TMP0]], %struct..kmp_privates.t** [[DOTADDR]], align 8 +// ARRAY-NEXT: store float*** [[TMP1]], float**** [[DOTADDR1]], align 8 +// ARRAY-NEXT: store %struct.St*** [[TMP2]], %struct.St**** [[DOTADDR2]], align 8 +// ARRAY-NEXT: [[TMP3:%.*]] = load %struct..kmp_privates.t*, %struct..kmp_privates.t** [[DOTADDR]], align 8 +// ARRAY-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP3]], i32 0, i32 0 +// ARRAY-NEXT: [[TMP5:%.*]] = load float***, float**** [[DOTADDR1]], align 8 +// ARRAY-NEXT: store float** [[TMP4]], float*** [[TMP5]], align 8 +// ARRAY-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP3]], i32 0, i32 1 +// ARRAY-NEXT: [[TMP7:%.*]] = load %struct.St***, %struct.St**** [[DOTADDR2]], align 8 +// ARRAY-NEXT: store %struct.St** [[TMP6]], %struct.St*** [[TMP7]], align 8 +// ARRAY-NEXT: ret void +// +// +// ARRAY-LABEL: define {{[^@]+}}@.omp_task_entry. +// ARRAY-SAME: (i32 noundef [[TMP0:%.*]], %struct.kmp_task_t_with_privates* noalias noundef [[TMP1:%.*]]) #[[ATTR5:[0-9]+]] { +// ARRAY-NEXT: entry: +// ARRAY-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4 +// ARRAY-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca i32*, align 8 +// ARRAY-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca i8*, align 8 +// ARRAY-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca void (i8*, ...)*, align 8 +// ARRAY-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca i8*, align 8 +// ARRAY-NEXT: [[DOTLB__ADDR_I:%.*]] = alloca i64, align 8 +// ARRAY-NEXT: [[DOTUB__ADDR_I:%.*]] = alloca i64, align 8 +// ARRAY-NEXT: [[DOTST__ADDR_I:%.*]] = alloca i64, align 8 +// ARRAY-NEXT: [[DOTLITER__ADDR_I:%.*]] = alloca i32, align 4 +// ARRAY-NEXT: [[DOTREDUCTIONS__ADDR_I:%.*]] = alloca i8*, align 8 +// ARRAY-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca %struct.anon*, align 8 +// ARRAY-NEXT: [[DOTFIRSTPRIV_PTR_ADDR_I:%.*]] = alloca float**, align 8 +// ARRAY-NEXT: [[DOTFIRSTPRIV_PTR_ADDR1_I:%.*]] = alloca %struct.St**, align 8 +// ARRAY-NEXT: [[I_I:%.*]] = alloca i32, align 4 +// ARRAY-NEXT: [[DOTOMP_IV_I:%.*]] = alloca i32, align 4 +// ARRAY-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 +// ARRAY-NEXT: [[DOTADDR1:%.*]] = alloca %struct.kmp_task_t_with_privates*, align 8 +// ARRAY-NEXT: store i32 [[TMP0]], i32* [[DOTADDR]], align 4 +// ARRAY-NEXT: store %struct.kmp_task_t_with_privates* [[TMP1]], %struct.kmp_task_t_with_privates** [[DOTADDR1]], align 8 +// ARRAY-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTADDR]], align 4 +// ARRAY-NEXT: [[TMP3:%.*]] = load %struct.kmp_task_t_with_privates*, %struct.kmp_task_t_with_privates** [[DOTADDR1]], align 8 +// ARRAY-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], %struct.kmp_task_t_with_privates* [[TMP3]], i32 0, i32 0 +// ARRAY-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 2 +// ARRAY-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 0 +// ARRAY-NEXT: [[TMP7:%.*]] = load i8*, i8** [[TMP6]], align 8 +// 
ARRAY-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct.anon* +// ARRAY-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], %struct.kmp_task_t_with_privates* [[TMP3]], i32 0, i32 1 +// ARRAY-NEXT: [[TMP10:%.*]] = bitcast %struct..kmp_privates.t* [[TMP9]] to i8* +// ARRAY-NEXT: [[TMP11:%.*]] = bitcast %struct.kmp_task_t_with_privates* [[TMP3]] to i8* +// ARRAY-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 5 +// ARRAY-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8 +// ARRAY-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 6 +// ARRAY-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8 +// ARRAY-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 7 +// ARRAY-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8 +// ARRAY-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 8 +// ARRAY-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 8 +// ARRAY-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 9 +// ARRAY-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 8 +// ARRAY-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]]) +// ARRAY-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]]) +// ARRAY-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META8:![0-9]+]]) +// ARRAY-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]]) +// ARRAY-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]]) +// ARRAY-NEXT: store i32 [[TMP2]], i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14 +// ARRAY-NEXT: store i32* [[TMP5]], i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !14 +// ARRAY-NEXT: store i8* [[TMP10]], i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !14 +// ARRAY-NEXT: store void (i8*, ...)* bitcast (void (%struct..kmp_privates.t*, float***, %struct.St***)* @.omp_task_privates_map. 
to void (i8*, ...)*), void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !14 +// ARRAY-NEXT: store i8* [[TMP11]], i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !14 +// ARRAY-NEXT: store i64 [[TMP13]], i64* [[DOTLB__ADDR_I]], align 8, !noalias !14 +// ARRAY-NEXT: store i64 [[TMP15]], i64* [[DOTUB__ADDR_I]], align 8, !noalias !14 +// ARRAY-NEXT: store i64 [[TMP17]], i64* [[DOTST__ADDR_I]], align 8, !noalias !14 +// ARRAY-NEXT: store i32 [[TMP19]], i32* [[DOTLITER__ADDR_I]], align 4, !noalias !14 +// ARRAY-NEXT: store i8* [[TMP21]], i8** [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias !14 +// ARRAY-NEXT: store %struct.anon* [[TMP8]], %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !14 +// ARRAY-NEXT: [[TMP22:%.*]] = load %struct.anon*, %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !14 +// ARRAY-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], %struct.anon* [[TMP22]], i32 0, i32 0 +// ARRAY-NEXT: [[TMP24:%.*]] = load i64, i64* [[TMP23]], align 8 +// ARRAY-NEXT: [[TMP25:%.*]] = load void (i8*, ...)*, void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !14 +// ARRAY-NEXT: [[TMP26:%.*]] = load i8*, i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !14 +// ARRAY-NEXT: [[TMP27:%.*]] = bitcast void (i8*, ...)* [[TMP25]] to void (i8*, float***, %struct.St***)* +// ARRAY-NEXT: call void [[TMP27]](i8* [[TMP26]], float*** [[DOTFIRSTPRIV_PTR_ADDR_I]], %struct.St*** [[DOTFIRSTPRIV_PTR_ADDR1_I]]) #[[ATTR2:[0-9]+]] +// ARRAY-NEXT: [[TMP28:%.*]] = load float**, float*** [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !14 +// ARRAY-NEXT: [[TMP29:%.*]] = load %struct.St**, %struct.St*** [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias !14 +// ARRAY-NEXT: [[TMP30:%.*]] = load i64, i64* [[DOTLB__ADDR_I]], align 8, !noalias !14 +// ARRAY-NEXT: [[CONV_I:%.*]] = trunc i64 [[TMP30]] to i32 +// ARRAY-NEXT: store i32 [[CONV_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !14 +// ARRAY-NEXT: br label [[OMP_INNER_FOR_COND_I:%.*]] +// ARRAY: omp.inner.for.cond.i: +// ARRAY-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14 +// ARRAY-NEXT: [[CONV2_I:%.*]] = sext i32 [[TMP31]] to i64 +// ARRAY-NEXT: [[TMP32:%.*]] = load i64, i64* [[DOTUB__ADDR_I]], align 8, !noalias !14 +// ARRAY-NEXT: [[CMP_I:%.*]] = icmp ule i64 [[CONV2_I]], [[TMP32]] +// ARRAY-NEXT: br i1 [[CMP_I]], label [[OMP_INNER_FOR_BODY_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]] +// ARRAY: omp.inner.for.body.i: +// ARRAY-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14 +// ARRAY-NEXT: store i32 [[TMP33]], i32* [[I_I]], align 4, !noalias !14 +// ARRAY-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14 +// ARRAY-NEXT: [[ADD3_I:%.*]] = add nsw i32 [[TMP34]], 1 +// ARRAY-NEXT: store i32 [[ADD3_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !14 +// ARRAY-NEXT: br label [[OMP_INNER_FOR_COND_I]] +// ARRAY: .omp_outlined..1.exit: +// ARRAY-NEXT: ret i32 0 +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@main +// SIMD-ONLY0-SAME: () #[[ATTR0:[0-9]+]] { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[TTT:%.*]] = alloca [[STRUCT_S:%.*]], align 8 +// SIMD-ONLY0-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S]], align 8 +// SIMD-ONLY0-NEXT: [[T_VAR:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4 +// SIMD-ONLY0-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S], align 16 +// SIMD-ONLY0-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S]], align 8 +// SIMD-ONLY0-NEXT: [[I:%.*]] = alloca i32, 
align 4 +// SIMD-ONLY0-NEXT: store i32 0, i32* [[RETVAL]], align 4 +// SIMD-ONLY0-NEXT: call void @_ZN1SIdEC1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[TTT]]) +// SIMD-ONLY0-NEXT: call void @_ZN1SIdEC1ERKS0_d(%struct.S* noundef nonnull align 8 dereferenceable(8) [[TEST]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[TTT]], double noundef 0.000000e+00) +// SIMD-ONLY0-NEXT: store i32 0, i32* [[T_VAR]], align 4 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = bitcast [2 x i32]* [[VEC]] to i8* +// SIMD-ONLY0-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP0]], i8* align 4 bitcast ([2 x i32]* @__const.main.vec to i8*), i64 8, i1 false) +// SIMD-ONLY0-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[S_ARR]], i64 0, i64 0 +// SIMD-ONLY0-NEXT: call void @_ZN1SIdEC1Ed(%struct.S* noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_BEGIN]], double noundef 1.000000e+00) +// SIMD-ONLY0-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[ARRAYINIT_BEGIN]], i64 1 +// SIMD-ONLY0-NEXT: call void @_ZN1SIdEC1Ed(%struct.S* noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_ELEMENT]], double noundef 2.000000e+00) +// SIMD-ONLY0-NEXT: call void @_ZN1SIdEC1Ed(%struct.S* noundef nonnull align 8 dereferenceable(8) [[VAR]], double noundef 3.000000e+00) +// SIMD-ONLY0-NEXT: store i32 0, i32* [[I]], align 4 +// SIMD-ONLY0-NEXT: br label [[FOR_COND:%.*]] +// SIMD-ONLY0: for.cond: +// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load i32, i32* [[I]], align 4 +// SIMD-ONLY0-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 10 +// SIMD-ONLY0-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// SIMD-ONLY0: for.body: +// SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load i32, i32* [[T_VAR]], align 4 +// SIMD-ONLY0-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[VEC]], i64 0, i64 0 +// SIMD-ONLY0-NEXT: store i32 [[TMP2]], i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY0-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[S_ARR]], i64 0, i64 0 +// SIMD-ONLY0-NEXT: [[TMP3:%.*]] = bitcast %struct.S* [[ARRAYIDX1]] to i8* +// SIMD-ONLY0-NEXT: [[TMP4:%.*]] = bitcast %struct.S* [[VAR]] to i8* +// SIMD-ONLY0-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 8 [[TMP4]], i64 8, i1 false) +// SIMD-ONLY0-NEXT: store i32 33, i32* @_ZZ4mainE5sivar, align 4 +// SIMD-ONLY0-NEXT: br label [[FOR_INC:%.*]] +// SIMD-ONLY0: for.inc: +// SIMD-ONLY0-NEXT: [[TMP5:%.*]] = load i32, i32* [[I]], align 4 +// SIMD-ONLY0-NEXT: [[INC:%.*]] = add nsw i32 [[TMP5]], 1 +// SIMD-ONLY0-NEXT: store i32 [[INC]], i32* [[I]], align 4 +// SIMD-ONLY0-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP2:![0-9]+]] +// SIMD-ONLY0: for.end: +// SIMD-ONLY0-NEXT: [[CALL:%.*]] = call noundef i32 @_Z5tmainIiET_v() +// SIMD-ONLY0-NEXT: store i32 [[CALL]], i32* [[RETVAL]], align 4 +// SIMD-ONLY0-NEXT: call void @_ZN1SIdED1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[VAR]]) #[[ATTR4:[0-9]+]] +// SIMD-ONLY0-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[S_ARR]], i32 0, i32 0 +// SIMD-ONLY0-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[ARRAY_BEGIN]], i64 2 +// SIMD-ONLY0-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] +// SIMD-ONLY0: arraydestroy.body: +// SIMD-ONLY0-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi %struct.S* [ [[TMP6]], [[FOR_END]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] +// SIMD-ONLY0-NEXT: [[ARRAYDESTROY_ELEMENT]] 
= getelementptr inbounds [[STRUCT_S]], %struct.S* [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 +// SIMD-ONLY0-NEXT: call void @_ZN1SIdED1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// SIMD-ONLY0-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq %struct.S* [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] +// SIMD-ONLY0-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE2:%.*]], label [[ARRAYDESTROY_BODY]] +// SIMD-ONLY0: arraydestroy.done2: +// SIMD-ONLY0-NEXT: call void @_ZN1SIdED1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[TEST]]) #[[ATTR4]] +// SIMD-ONLY0-NEXT: call void @_ZN1SIdED1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[TTT]]) #[[ATTR4]] +// SIMD-ONLY0-NEXT: [[TMP7:%.*]] = load i32, i32* [[RETVAL]], align 4 +// SIMD-ONLY0-NEXT: ret i32 [[TMP7]] +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIdEC1Ev +// SIMD-ONLY0-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY0-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: call void @_ZN1SIdEC2Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS1]]) +// SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIdEC1ERKS0_d +// SIMD-ONLY0-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[S:%.*]], double noundef [[T:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY0-NEXT: [[S_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY0-NEXT: [[T_ADDR:%.*]] = alloca double, align 8 +// SIMD-ONLY0-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: store %struct.S* [[S]], %struct.S** [[S_ADDR]], align 8 +// SIMD-ONLY0-NEXT: store double [[T]], double* [[T_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load %struct.S*, %struct.S** [[S_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load double, double* [[T_ADDR]], align 8 +// SIMD-ONLY0-NEXT: call void @_ZN1SIdEC2ERKS0_d(%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS1]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[TMP0]], double noundef [[TMP1]]) +// SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIdEC1Ed +// SIMD-ONLY0-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], double noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY0-NEXT: [[A_ADDR:%.*]] = alloca double, align 8 +// SIMD-ONLY0-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: store double [[A]], double* [[A_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load double, double* [[A_ADDR]], align 8 +// SIMD-ONLY0-NEXT: call void @_ZN1SIdEC2Ed(%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS1]], double noundef [[TMP0]]) +// SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define 
{{[^@]+}}@_Z5tmainIiET_v +// SIMD-ONLY0-SAME: () #[[ATTR3:[0-9]+]] { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[TTT:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 +// SIMD-ONLY0-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0]], align 4 +// SIMD-ONLY0-NEXT: [[T_VAR:%.*]] = alloca i32, align 128 +// SIMD-ONLY0-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4 +// SIMD-ONLY0-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S.0], align 4 +// SIMD-ONLY0-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S_0]], align 4 +// SIMD-ONLY0-NEXT: [[I:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: call void @_ZN1SIiEC1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TTT]]) +// SIMD-ONLY0-NEXT: call void @_ZN1SIiEC1ERKS0_i(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TEST]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TTT]], i32 noundef 0) +// SIMD-ONLY0-NEXT: store i32 0, i32* [[T_VAR]], align 128 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = bitcast [2 x i32]* [[VEC]] to i8* +// SIMD-ONLY0-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP0]], i8* align 4 bitcast ([2 x i32]* @__const._Z5tmainIiET_v.vec to i8*), i64 8, i1 false) +// SIMD-ONLY0-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[S_ARR]], i64 0, i64 0 +// SIMD-ONLY0-NEXT: call void @_ZN1SIiEC1Ei(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1) +// SIMD-ONLY0-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], %struct.S.0* [[ARRAYINIT_BEGIN]], i64 1 +// SIMD-ONLY0-NEXT: call void @_ZN1SIiEC1Ei(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2) +// SIMD-ONLY0-NEXT: call void @_ZN1SIiEC1Ei(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[VAR]], i32 noundef 3) +// SIMD-ONLY0-NEXT: store i32 0, i32* [[I]], align 4 +// SIMD-ONLY0-NEXT: br label [[FOR_COND:%.*]] +// SIMD-ONLY0: for.cond: +// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load i32, i32* [[I]], align 4 +// SIMD-ONLY0-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 10 +// SIMD-ONLY0-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// SIMD-ONLY0: for.body: +// SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load i32, i32* [[T_VAR]], align 128 +// SIMD-ONLY0-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[VEC]], i64 0, i64 0 +// SIMD-ONLY0-NEXT: store i32 [[TMP2]], i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY0-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[S_ARR]], i64 0, i64 0 +// SIMD-ONLY0-NEXT: [[TMP3:%.*]] = bitcast %struct.S.0* [[ARRAYIDX1]] to i8* +// SIMD-ONLY0-NEXT: [[TMP4:%.*]] = bitcast %struct.S.0* [[VAR]] to i8* +// SIMD-ONLY0-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP3]], i8* align 4 [[TMP4]], i64 4, i1 false) +// SIMD-ONLY0-NEXT: br label [[FOR_INC:%.*]] +// SIMD-ONLY0: for.inc: +// SIMD-ONLY0-NEXT: [[TMP5:%.*]] = load i32, i32* [[I]], align 4 +// SIMD-ONLY0-NEXT: [[INC:%.*]] = add nsw i32 [[TMP5]], 1 +// SIMD-ONLY0-NEXT: store i32 [[INC]], i32* [[I]], align 4 +// SIMD-ONLY0-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]] +// SIMD-ONLY0: for.end: +// SIMD-ONLY0-NEXT: store i32 0, i32* [[RETVAL]], align 4 +// SIMD-ONLY0-NEXT: call void @_ZN1SIiED1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR4]] +// SIMD-ONLY0-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[S_ARR]], i32 0, i32 0 +// SIMD-ONLY0-NEXT: 
[[TMP6:%.*]] = getelementptr inbounds [[STRUCT_S_0]], %struct.S.0* [[ARRAY_BEGIN]], i64 2 +// SIMD-ONLY0-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] +// SIMD-ONLY0: arraydestroy.body: +// SIMD-ONLY0-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi %struct.S.0* [ [[TMP6]], [[FOR_END]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] +// SIMD-ONLY0-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], %struct.S.0* [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 +// SIMD-ONLY0-NEXT: call void @_ZN1SIiED1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// SIMD-ONLY0-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq %struct.S.0* [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] +// SIMD-ONLY0-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE2:%.*]], label [[ARRAYDESTROY_BODY]] +// SIMD-ONLY0: arraydestroy.done2: +// SIMD-ONLY0-NEXT: call void @_ZN1SIiED1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// SIMD-ONLY0-NEXT: call void @_ZN1SIiED1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TTT]]) #[[ATTR4]] +// SIMD-ONLY0-NEXT: [[TMP7:%.*]] = load i32, i32* [[RETVAL]], align 4 +// SIMD-ONLY0-NEXT: ret i32 [[TMP7]] +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIdED1Ev +// SIMD-ONLY0-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY0-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: call void @_ZN1SIdED2Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS1]]) #[[ATTR4]] +// SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIdEC2Ev +// SIMD-ONLY0-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY0-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY0-NEXT: store double 0.000000e+00, double* [[F]], align 8 +// SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIdED2Ev +// SIMD-ONLY0-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY0-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIdEC2ERKS0_d +// SIMD-ONLY0-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[S:%.*]], double noundef [[T:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY0-NEXT: [[S_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY0-NEXT: [[T_ADDR:%.*]] = alloca double, align 8 +// SIMD-ONLY0-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 
8 +// SIMD-ONLY0-NEXT: store %struct.S* [[S]], %struct.S** [[S_ADDR]], align 8 +// SIMD-ONLY0-NEXT: store double [[T]], double* [[T_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load %struct.S*, %struct.S** [[S_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[F2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[TMP0]], i32 0, i32 0 +// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load double, double* [[F2]], align 8 +// SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load double, double* [[T_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], [[TMP2]] +// SIMD-ONLY0-NEXT: store double [[ADD]], double* [[F]], align 8 +// SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIdEC2Ed +// SIMD-ONLY0-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], double noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY0-NEXT: [[A_ADDR:%.*]] = alloca double, align 8 +// SIMD-ONLY0-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: store double [[A]], double* [[A_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load double, double* [[A_ADDR]], align 8 +// SIMD-ONLY0-NEXT: store double [[TMP0]], double* [[F]], align 8 +// SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIiEC1Ev +// SIMD-ONLY0-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY0-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: call void @_ZN1SIiEC2Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS1]]) +// SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIiEC1ERKS0_i +// SIMD-ONLY0-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[S:%.*]], i32 noundef [[T:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY0-NEXT: [[S_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY0-NEXT: [[T_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: store %struct.S.0* [[S]], %struct.S.0** [[S_ADDR]], align 8 +// SIMD-ONLY0-NEXT: store i32 [[T]], i32* [[T_ADDR]], align 4 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load %struct.S.0*, %struct.S.0** [[S_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load i32, i32* [[T_ADDR]], align 4 +// SIMD-ONLY0-NEXT: call void @_ZN1SIiEC2ERKS0_i(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS1]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TMP0]], i32 noundef [[TMP1]]) +// 
SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIiEC1Ei +// SIMD-ONLY0-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY0-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// SIMD-ONLY0-NEXT: call void @_ZN1SIiEC2Ei(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS1]], i32 noundef [[TMP0]]) +// SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIiED1Ev +// SIMD-ONLY0-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY0-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: call void @_ZN1SIiED2Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIiEC2Ev +// SIMD-ONLY0-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY0-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], %struct.S.0* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY0-NEXT: store i32 0, i32* [[F]], align 4 +// SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIiEC2ERKS0_i +// SIMD-ONLY0-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[S:%.*]], i32 noundef [[T:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY0-NEXT: [[S_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY0-NEXT: [[T_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: store %struct.S.0* [[S]], %struct.S.0** [[S_ADDR]], align 8 +// SIMD-ONLY0-NEXT: store i32 [[T]], i32* [[T_ADDR]], align 4 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], %struct.S.0* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load %struct.S.0*, %struct.S.0** [[S_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[F2:%.*]] = getelementptr inbounds [[STRUCT_S_0]], %struct.S.0* [[TMP0]], i32 0, i32 0 +// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load i32, i32* [[F2]], align 4 +// SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load i32, i32* [[T_ADDR]], align 4 +// SIMD-ONLY0-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP2]] +// SIMD-ONLY0-NEXT: store i32 [[ADD]], i32* [[F]], align 4 +// 
SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIiEC2Ei +// SIMD-ONLY0-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY0-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], %struct.S.0* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// SIMD-ONLY0-NEXT: store i32 [[TMP0]], i32* [[F]], align 4 +// SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIiED2Ev +// SIMD-ONLY0-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY0-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@main +// SIMD-ONLY1-SAME: () #[[ATTR0:[0-9]+]] { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: [[TTT:%.*]] = alloca [[STRUCT_S:%.*]], align 8 +// SIMD-ONLY1-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S]], align 8 +// SIMD-ONLY1-NEXT: [[T_VAR:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4 +// SIMD-ONLY1-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S], align 16 +// SIMD-ONLY1-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S]], align 8 +// SIMD-ONLY1-NEXT: [[I:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: store i32 0, i32* [[RETVAL]], align 4 +// SIMD-ONLY1-NEXT: call void @_ZN1SIdEC1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[TTT]]) +// SIMD-ONLY1-NEXT: call void @_ZN1SIdEC1ERKS0_d(%struct.S* noundef nonnull align 8 dereferenceable(8) [[TEST]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[TTT]], double noundef 0.000000e+00) +// SIMD-ONLY1-NEXT: store i32 0, i32* [[T_VAR]], align 4 +// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = bitcast [2 x i32]* [[VEC]] to i8* +// SIMD-ONLY1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP0]], i8* align 4 bitcast ([2 x i32]* @__const.main.vec to i8*), i64 8, i1 false) +// SIMD-ONLY1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[S_ARR]], i64 0, i64 0 +// SIMD-ONLY1-NEXT: call void @_ZN1SIdEC1Ed(%struct.S* noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_BEGIN]], double noundef 1.000000e+00) +// SIMD-ONLY1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[ARRAYINIT_BEGIN]], i64 1 +// SIMD-ONLY1-NEXT: call void @_ZN1SIdEC1Ed(%struct.S* noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_ELEMENT]], double noundef 2.000000e+00) +// SIMD-ONLY1-NEXT: call void @_ZN1SIdEC1Ed(%struct.S* noundef nonnull align 8 dereferenceable(8) [[VAR]], double noundef 3.000000e+00) +// SIMD-ONLY1-NEXT: store i32 0, i32* [[I]], align 4 +// SIMD-ONLY1-NEXT: br label [[FOR_COND:%.*]] +// SIMD-ONLY1: for.cond: +// SIMD-ONLY1-NEXT: [[TMP1:%.*]] = load i32, i32* [[I]], 
align 4 +// SIMD-ONLY1-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 10 +// SIMD-ONLY1-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// SIMD-ONLY1: for.body: +// SIMD-ONLY1-NEXT: [[TMP2:%.*]] = load i32, i32* [[T_VAR]], align 4 +// SIMD-ONLY1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[VEC]], i64 0, i64 0 +// SIMD-ONLY1-NEXT: store i32 [[TMP2]], i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[S_ARR]], i64 0, i64 0 +// SIMD-ONLY1-NEXT: [[TMP3:%.*]] = bitcast %struct.S* [[ARRAYIDX1]] to i8* +// SIMD-ONLY1-NEXT: [[TMP4:%.*]] = bitcast %struct.S* [[VAR]] to i8* +// SIMD-ONLY1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 8 [[TMP4]], i64 8, i1 false) +// SIMD-ONLY1-NEXT: store i32 33, i32* @_ZZ4mainE5sivar, align 4 +// SIMD-ONLY1-NEXT: br label [[FOR_INC:%.*]] +// SIMD-ONLY1: for.inc: +// SIMD-ONLY1-NEXT: [[TMP5:%.*]] = load i32, i32* [[I]], align 4 +// SIMD-ONLY1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP5]], 1 +// SIMD-ONLY1-NEXT: store i32 [[INC]], i32* [[I]], align 4 +// SIMD-ONLY1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP2:![0-9]+]] +// SIMD-ONLY1: for.end: +// SIMD-ONLY1-NEXT: [[CALL:%.*]] = call noundef i32 @_Z5tmainIiET_v() +// SIMD-ONLY1-NEXT: store i32 [[CALL]], i32* [[RETVAL]], align 4 +// SIMD-ONLY1-NEXT: call void @_ZN1SIdED1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[VAR]]) #[[ATTR4:[0-9]+]] +// SIMD-ONLY1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[S_ARR]], i32 0, i32 0 +// SIMD-ONLY1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[ARRAY_BEGIN]], i64 2 +// SIMD-ONLY1-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] +// SIMD-ONLY1: arraydestroy.body: +// SIMD-ONLY1-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi %struct.S* [ [[TMP6]], [[FOR_END]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] +// SIMD-ONLY1-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 +// SIMD-ONLY1-NEXT: call void @_ZN1SIdED1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// SIMD-ONLY1-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq %struct.S* [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] +// SIMD-ONLY1-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE2:%.*]], label [[ARRAYDESTROY_BODY]] +// SIMD-ONLY1: arraydestroy.done2: +// SIMD-ONLY1-NEXT: call void @_ZN1SIdED1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[TEST]]) #[[ATTR4]] +// SIMD-ONLY1-NEXT: call void @_ZN1SIdED1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[TTT]]) #[[ATTR4]] +// SIMD-ONLY1-NEXT: [[TMP7:%.*]] = load i32, i32* [[RETVAL]], align 4 +// SIMD-ONLY1-NEXT: ret i32 [[TMP7]] +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN1SIdEC1Ev +// SIMD-ONLY1-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY1-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: call void @_ZN1SIdEC2Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS1]]) +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN1SIdEC1ERKS0_d +// SIMD-ONLY1-SAME: (%struct.S* noundef 
nonnull align 8 dereferenceable(8) [[THIS:%.*]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[S:%.*]], double noundef [[T:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY1-NEXT: [[S_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY1-NEXT: [[T_ADDR:%.*]] = alloca double, align 8 +// SIMD-ONLY1-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: store %struct.S* [[S]], %struct.S** [[S_ADDR]], align 8 +// SIMD-ONLY1-NEXT: store double [[T]], double* [[T_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = load %struct.S*, %struct.S** [[S_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[TMP1:%.*]] = load double, double* [[T_ADDR]], align 8 +// SIMD-ONLY1-NEXT: call void @_ZN1SIdEC2ERKS0_d(%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS1]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[TMP0]], double noundef [[TMP1]]) +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN1SIdEC1Ed +// SIMD-ONLY1-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], double noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY1-NEXT: [[A_ADDR:%.*]] = alloca double, align 8 +// SIMD-ONLY1-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: store double [[A]], double* [[A_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = load double, double* [[A_ADDR]], align 8 +// SIMD-ONLY1-NEXT: call void @_ZN1SIdEC2Ed(%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS1]], double noundef [[TMP0]]) +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_Z5tmainIiET_v +// SIMD-ONLY1-SAME: () #[[ATTR3:[0-9]+]] { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: [[TTT:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 +// SIMD-ONLY1-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0]], align 4 +// SIMD-ONLY1-NEXT: [[T_VAR:%.*]] = alloca i32, align 128 +// SIMD-ONLY1-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4 +// SIMD-ONLY1-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S.0], align 4 +// SIMD-ONLY1-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S_0]], align 4 +// SIMD-ONLY1-NEXT: [[I:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: call void @_ZN1SIiEC1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TTT]]) +// SIMD-ONLY1-NEXT: call void @_ZN1SIiEC1ERKS0_i(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TEST]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TTT]], i32 noundef 0) +// SIMD-ONLY1-NEXT: store i32 0, i32* [[T_VAR]], align 128 +// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = bitcast [2 x i32]* [[VEC]] to i8* +// SIMD-ONLY1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP0]], i8* align 4 bitcast ([2 x i32]* @__const._Z5tmainIiET_v.vec to i8*), i64 8, i1 false) +// SIMD-ONLY1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[S_ARR]], i64 0, i64 0 +// SIMD-ONLY1-NEXT: call void @_ZN1SIiEC1Ei(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1) +// SIMD-ONLY1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds 
[[STRUCT_S_0]], %struct.S.0* [[ARRAYINIT_BEGIN]], i64 1 +// SIMD-ONLY1-NEXT: call void @_ZN1SIiEC1Ei(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2) +// SIMD-ONLY1-NEXT: call void @_ZN1SIiEC1Ei(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[VAR]], i32 noundef 3) +// SIMD-ONLY1-NEXT: store i32 0, i32* [[I]], align 4 +// SIMD-ONLY1-NEXT: br label [[FOR_COND:%.*]] +// SIMD-ONLY1: for.cond: +// SIMD-ONLY1-NEXT: [[TMP1:%.*]] = load i32, i32* [[I]], align 4 +// SIMD-ONLY1-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 10 +// SIMD-ONLY1-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// SIMD-ONLY1: for.body: +// SIMD-ONLY1-NEXT: [[TMP2:%.*]] = load i32, i32* [[T_VAR]], align 128 +// SIMD-ONLY1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[VEC]], i64 0, i64 0 +// SIMD-ONLY1-NEXT: store i32 [[TMP2]], i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[S_ARR]], i64 0, i64 0 +// SIMD-ONLY1-NEXT: [[TMP3:%.*]] = bitcast %struct.S.0* [[ARRAYIDX1]] to i8* +// SIMD-ONLY1-NEXT: [[TMP4:%.*]] = bitcast %struct.S.0* [[VAR]] to i8* +// SIMD-ONLY1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP3]], i8* align 4 [[TMP4]], i64 4, i1 false) +// SIMD-ONLY1-NEXT: br label [[FOR_INC:%.*]] +// SIMD-ONLY1: for.inc: +// SIMD-ONLY1-NEXT: [[TMP5:%.*]] = load i32, i32* [[I]], align 4 +// SIMD-ONLY1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP5]], 1 +// SIMD-ONLY1-NEXT: store i32 [[INC]], i32* [[I]], align 4 +// SIMD-ONLY1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]] +// SIMD-ONLY1: for.end: +// SIMD-ONLY1-NEXT: store i32 0, i32* [[RETVAL]], align 4 +// SIMD-ONLY1-NEXT: call void @_ZN1SIiED1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR4]] +// SIMD-ONLY1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[S_ARR]], i32 0, i32 0 +// SIMD-ONLY1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_S_0]], %struct.S.0* [[ARRAY_BEGIN]], i64 2 +// SIMD-ONLY1-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] +// SIMD-ONLY1: arraydestroy.body: +// SIMD-ONLY1-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi %struct.S.0* [ [[TMP6]], [[FOR_END]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] +// SIMD-ONLY1-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], %struct.S.0* [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 +// SIMD-ONLY1-NEXT: call void @_ZN1SIiED1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// SIMD-ONLY1-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq %struct.S.0* [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] +// SIMD-ONLY1-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE2:%.*]], label [[ARRAYDESTROY_BODY]] +// SIMD-ONLY1: arraydestroy.done2: +// SIMD-ONLY1-NEXT: call void @_ZN1SIiED1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// SIMD-ONLY1-NEXT: call void @_ZN1SIiED1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TTT]]) #[[ATTR4]] +// SIMD-ONLY1-NEXT: [[TMP7:%.*]] = load i32, i32* [[RETVAL]], align 4 +// SIMD-ONLY1-NEXT: ret i32 [[TMP7]] +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN1SIdED1Ev +// SIMD-ONLY1-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY1-NEXT: store %struct.S* 
[[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: call void @_ZN1SIdED2Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS1]]) #[[ATTR4]] +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN1SIdEC2Ev +// SIMD-ONLY1-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY1-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY1-NEXT: store double 0.000000e+00, double* [[F]], align 8 +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN1SIdED2Ev +// SIMD-ONLY1-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY1-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN1SIdEC2ERKS0_d +// SIMD-ONLY1-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[S:%.*]], double noundef [[T:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY1-NEXT: [[S_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY1-NEXT: [[T_ADDR:%.*]] = alloca double, align 8 +// SIMD-ONLY1-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: store %struct.S* [[S]], %struct.S** [[S_ADDR]], align 8 +// SIMD-ONLY1-NEXT: store double [[T]], double* [[T_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = load %struct.S*, %struct.S** [[S_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[F2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[TMP0]], i32 0, i32 0 +// SIMD-ONLY1-NEXT: [[TMP1:%.*]] = load double, double* [[F2]], align 8 +// SIMD-ONLY1-NEXT: [[TMP2:%.*]] = load double, double* [[T_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], [[TMP2]] +// SIMD-ONLY1-NEXT: store double [[ADD]], double* [[F]], align 8 +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN1SIdEC2Ed +// SIMD-ONLY1-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], double noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY1-NEXT: [[A_ADDR:%.*]] = alloca double, align 8 +// SIMD-ONLY1-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: store double [[A]], double* [[A_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* 
[[THIS1]], i32 0, i32 0 +// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = load double, double* [[A_ADDR]], align 8 +// SIMD-ONLY1-NEXT: store double [[TMP0]], double* [[F]], align 8 +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN1SIiEC1Ev +// SIMD-ONLY1-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY1-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: call void @_ZN1SIiEC2Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS1]]) +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN1SIiEC1ERKS0_i +// SIMD-ONLY1-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[S:%.*]], i32 noundef [[T:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY1-NEXT: [[S_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY1-NEXT: [[T_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: store %struct.S.0* [[S]], %struct.S.0** [[S_ADDR]], align 8 +// SIMD-ONLY1-NEXT: store i32 [[T]], i32* [[T_ADDR]], align 4 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = load %struct.S.0*, %struct.S.0** [[S_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[TMP1:%.*]] = load i32, i32* [[T_ADDR]], align 4 +// SIMD-ONLY1-NEXT: call void @_ZN1SIiEC2ERKS0_i(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS1]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TMP0]], i32 noundef [[TMP1]]) +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN1SIiEC1Ei +// SIMD-ONLY1-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY1-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// SIMD-ONLY1-NEXT: call void @_ZN1SIiEC2Ei(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS1]], i32 noundef [[TMP0]]) +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN1SIiED1Ev +// SIMD-ONLY1-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY1-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: call void @_ZN1SIiED2Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN1SIiEC2Ev +// 
SIMD-ONLY1-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY1-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], %struct.S.0* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY1-NEXT: store i32 0, i32* [[F]], align 4 +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN1SIiEC2ERKS0_i +// SIMD-ONLY1-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[S:%.*]], i32 noundef [[T:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY1-NEXT: [[S_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY1-NEXT: [[T_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: store %struct.S.0* [[S]], %struct.S.0** [[S_ADDR]], align 8 +// SIMD-ONLY1-NEXT: store i32 [[T]], i32* [[T_ADDR]], align 4 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], %struct.S.0* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = load %struct.S.0*, %struct.S.0** [[S_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[F2:%.*]] = getelementptr inbounds [[STRUCT_S_0]], %struct.S.0* [[TMP0]], i32 0, i32 0 +// SIMD-ONLY1-NEXT: [[TMP1:%.*]] = load i32, i32* [[F2]], align 4 +// SIMD-ONLY1-NEXT: [[TMP2:%.*]] = load i32, i32* [[T_ADDR]], align 4 +// SIMD-ONLY1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP2]] +// SIMD-ONLY1-NEXT: store i32 [[ADD]], i32* [[F]], align 4 +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN1SIiEC2Ei +// SIMD-ONLY1-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY1-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], %struct.S.0* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// SIMD-ONLY1-NEXT: store i32 [[TMP0]], i32* [[F]], align 4 +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN1SIiED2Ev +// SIMD-ONLY1-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY1-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY2-LABEL: define {{[^@]+}}@main +// SIMD-ONLY2-SAME: () #[[ATTR0:[0-9]+]] { +// SIMD-ONLY2-NEXT: entry: +// SIMD-ONLY2-NEXT: 
[[RETVAL:%.*]] = alloca i32, align 4 +// SIMD-ONLY2-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_ANON:%.*]], align 1 +// SIMD-ONLY2-NEXT: store i32 0, i32* [[RETVAL]], align 4 +// SIMD-ONLY2-NEXT: call void @"_ZZ4mainENK3$_0clEv"(%class.anon* noundef nonnull align 1 dereferenceable(1) [[REF_TMP]]) +// SIMD-ONLY2-NEXT: ret i32 0 +// +// +// SIMD-ONLY3-LABEL: define {{[^@]+}}@main +// SIMD-ONLY3-SAME: () #[[ATTR1:[0-9]+]] { +// SIMD-ONLY3-NEXT: entry: +// SIMD-ONLY3-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// SIMD-ONLY3-NEXT: store i32 0, i32* [[RETVAL]], align 4 +// SIMD-ONLY3-NEXT: [[TMP0:%.*]] = load i8*, i8** getelementptr inbounds ([[STRUCT___BLOCK_LITERAL_GENERIC:%.*]], %struct.__block_literal_generic* bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor* }* @__block_literal_global to %struct.__block_literal_generic*), i32 0, i32 3), align 8 +// SIMD-ONLY3-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to void (i8*)* +// SIMD-ONLY3-NEXT: call void [[TMP1]](i8* noundef bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor* }* @__block_literal_global to i8*)) +// SIMD-ONLY3-NEXT: ret i32 0 +// +// +// SIMD-ONLY3-LABEL: define {{[^@]+}}@__main_block_invoke +// SIMD-ONLY3-SAME: (i8* noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR2:[0-9]+]] { +// SIMD-ONLY3-NEXT: entry: +// SIMD-ONLY3-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca i8*, align 8 +// SIMD-ONLY3-NEXT: [[BLOCK_ADDR:%.*]] = alloca <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>*, align 8 +// SIMD-ONLY3-NEXT: [[I:%.*]] = alloca i32, align 4 +// SIMD-ONLY3-NEXT: [[BLOCK1:%.*]] = alloca <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, align 8 +// SIMD-ONLY3-NEXT: store i8* [[DOTBLOCK_DESCRIPTOR]], i8** [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8 +// SIMD-ONLY3-NEXT: [[BLOCK:%.*]] = bitcast i8* [[DOTBLOCK_DESCRIPTOR]] to <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>* +// SIMD-ONLY3-NEXT: store <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>* [[BLOCK]], <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>** [[BLOCK_ADDR]], align 8 +// SIMD-ONLY3-NEXT: store i32 0, i32* [[I]], align 4 +// SIMD-ONLY3-NEXT: br label [[FOR_COND:%.*]] +// SIMD-ONLY3: for.cond: +// SIMD-ONLY3-NEXT: [[TMP0:%.*]] = load i32, i32* [[I]], align 4 +// SIMD-ONLY3-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 10 +// SIMD-ONLY3-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// SIMD-ONLY3: for.body: +// SIMD-ONLY3-NEXT: store double 1.000000e+00, double* @g, align 8 +// SIMD-ONLY3-NEXT: store i32 11, i32* @_ZZ4mainE5sivar, align 4 +// SIMD-ONLY3-NEXT: [[BLOCK_ISA:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK1]], i32 0, i32 0 +// SIMD-ONLY3-NEXT: store i8* bitcast (i8** @_NSConcreteStackBlock to i8*), i8** [[BLOCK_ISA]], align 8 +// SIMD-ONLY3-NEXT: [[BLOCK_FLAGS:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK1]], i32 0, i32 1 +// SIMD-ONLY3-NEXT: store i32 1073741824, i32* [[BLOCK_FLAGS]], align 8 +// SIMD-ONLY3-NEXT: [[BLOCK_RESERVED:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK1]], i32 0, i32 2 +// SIMD-ONLY3-NEXT: store i32 0, i32* [[BLOCK_RESERVED]], align 4 +// SIMD-ONLY3-NEXT: [[BLOCK_INVOKE:%.*]] = getelementptr inbounds <{ i8*, 
i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK1]], i32 0, i32 3 +// SIMD-ONLY3-NEXT: store i8* bitcast (void (i8*)* @__main_block_invoke_2 to i8*), i8** [[BLOCK_INVOKE]], align 8 +// SIMD-ONLY3-NEXT: [[BLOCK_DESCRIPTOR:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK1]], i32 0, i32 4 +// SIMD-ONLY3-NEXT: store %struct.__block_descriptor* bitcast ({ i64, i64, i8*, i8* }* @__block_descriptor_tmp.1 to %struct.__block_descriptor*), %struct.__block_descriptor** [[BLOCK_DESCRIPTOR]], align 8 +// SIMD-ONLY3-NEXT: [[BLOCK_CAPTURED:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK1]], i32 0, i32 5 +// SIMD-ONLY3-NEXT: [[TMP1:%.*]] = load volatile double, double* @g, align 8 +// SIMD-ONLY3-NEXT: store volatile double [[TMP1]], double* [[BLOCK_CAPTURED]], align 8 +// SIMD-ONLY3-NEXT: [[BLOCK_CAPTURED2:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK1]], i32 0, i32 6 +// SIMD-ONLY3-NEXT: [[TMP2:%.*]] = load i32, i32* @_ZZ4mainE5sivar, align 4 +// SIMD-ONLY3-NEXT: store i32 [[TMP2]], i32* [[BLOCK_CAPTURED2]], align 8 +// SIMD-ONLY3-NEXT: [[TMP3:%.*]] = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK1]] to void ()* +// SIMD-ONLY3-NEXT: [[BLOCK_LITERAL:%.*]] = bitcast void ()* [[TMP3]] to %struct.__block_literal_generic* +// SIMD-ONLY3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT___BLOCK_LITERAL_GENERIC:%.*]], %struct.__block_literal_generic* [[BLOCK_LITERAL]], i32 0, i32 3 +// SIMD-ONLY3-NEXT: [[TMP5:%.*]] = bitcast %struct.__block_literal_generic* [[BLOCK_LITERAL]] to i8* +// SIMD-ONLY3-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP4]], align 8 +// SIMD-ONLY3-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to void (i8*)* +// SIMD-ONLY3-NEXT: call void [[TMP7]](i8* noundef [[TMP5]]) +// SIMD-ONLY3-NEXT: br label [[FOR_INC:%.*]] +// SIMD-ONLY3: for.inc: +// SIMD-ONLY3-NEXT: [[TMP8:%.*]] = load i32, i32* [[I]], align 4 +// SIMD-ONLY3-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1 +// SIMD-ONLY3-NEXT: store i32 [[INC]], i32* [[I]], align 4 +// SIMD-ONLY3-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP2:![0-9]+]] +// SIMD-ONLY3: for.end: +// SIMD-ONLY3-NEXT: ret void +// +// +// SIMD-ONLY3-LABEL: define {{[^@]+}}@__main_block_invoke_2 +// SIMD-ONLY3-SAME: (i8* noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR2]] { +// SIMD-ONLY3-NEXT: entry: +// SIMD-ONLY3-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca i8*, align 8 +// SIMD-ONLY3-NEXT: [[BLOCK_ADDR:%.*]] = alloca <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>*, align 8 +// SIMD-ONLY3-NEXT: store i8* [[DOTBLOCK_DESCRIPTOR]], i8** [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8 +// SIMD-ONLY3-NEXT: [[BLOCK:%.*]] = bitcast i8* [[DOTBLOCK_DESCRIPTOR]] to <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* +// SIMD-ONLY3-NEXT: store <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK]], <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>** [[BLOCK_ADDR]], align 8 +// SIMD-ONLY3-NEXT: [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, 
i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK]], i32 0, i32 5 +// SIMD-ONLY3-NEXT: store double 2.000000e+00, double* [[BLOCK_CAPTURE_ADDR]], align 8 +// SIMD-ONLY3-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK]], i32 0, i32 6 +// SIMD-ONLY3-NEXT: store i32 22, i32* [[BLOCK_CAPTURE_ADDR1]], align 8 +// SIMD-ONLY3-NEXT: ret void +// +// +// SIMD-ONLY4-LABEL: define {{[^@]+}}@_Z10array_funciPfP2St +// SIMD-ONLY4-SAME: (i32 noundef [[N:%.*]], float* noundef [[A:%.*]], %struct.St* noundef [[S:%.*]]) #[[ATTR0:[0-9]+]] { +// SIMD-ONLY4-NEXT: entry: +// SIMD-ONLY4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY4-NEXT: [[A_ADDR:%.*]] = alloca float*, align 8 +// SIMD-ONLY4-NEXT: [[S_ADDR:%.*]] = alloca %struct.St*, align 8 +// SIMD-ONLY4-NEXT: [[I:%.*]] = alloca i32, align 4 +// SIMD-ONLY4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY4-NEXT: store float* [[A]], float** [[A_ADDR]], align 8 +// SIMD-ONLY4-NEXT: store %struct.St* [[S]], %struct.St** [[S_ADDR]], align 8 +// SIMD-ONLY4-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY4-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +// SIMD-ONLY4-NEXT: store i32 0, i32* [[I]], align 4 +// SIMD-ONLY4-NEXT: br label [[FOR_COND:%.*]] +// SIMD-ONLY4: for.cond: +// SIMD-ONLY4-NEXT: [[TMP2:%.*]] = load i32, i32* [[I]], align 4 +// SIMD-ONLY4-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP2]], 10 +// SIMD-ONLY4-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// SIMD-ONLY4: for.body: +// SIMD-ONLY4-NEXT: br label [[FOR_INC:%.*]] +// SIMD-ONLY4: for.inc: +// SIMD-ONLY4-NEXT: [[TMP3:%.*]] = load i32, i32* [[I]], align 4 +// SIMD-ONLY4-NEXT: [[INC:%.*]] = add nsw i32 [[TMP3]], 1 +// SIMD-ONLY4-NEXT: store i32 [[INC]], i32* [[I]], align 4 +// SIMD-ONLY4-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP2:![0-9]+]] +// SIMD-ONLY4: for.end: +// SIMD-ONLY4-NEXT: ret void +// diff --git a/clang/test/OpenMP/parallel_master_taskloop_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/parallel_master_taskloop_simd_firstprivate_codegen.cpp index f4ded3c7797e9..31e610d0ddffa 100644 --- a/clang/test/OpenMP/parallel_master_taskloop_simd_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/parallel_master_taskloop_simd_firstprivate_codegen.cpp @@ -1,3 +1,4 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ // RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s // RUN: %clang_cc1 -no-opaque-pointers -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s // RUN: %clang_cc1 -no-opaque-pointers -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s @@ -7,11 +8,10 @@ // RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s // RUN: %clang_cc1 -no-opaque-pointers -fopenmp-simd -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s -// RUN: %clang_cc1 -no-opaque-pointers -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck 
--check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp-simd -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp-simd -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp-simd -x c++ -std=c++11 -DARRAY -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// SIMD-ONLY0-NOT: {{__kmpc|__tgt}} +// RUN: %clang_cc1 -no-opaque-pointers -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp-simd -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY2 %s +// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp-simd -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY3 %s +// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp-simd -x c++ -std=c++11 -DARRAY -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY4 %s // expected-no-diagnostics #ifndef ARRAY @@ -30,15 +30,6 @@ struct S { volatile double g; -// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}}, i64, i64, i64, i32, i8* } -// CHECK-DAG: [[S_DOUBLE_TY:%.+]] = type { double } -// CHECK-DAG: [[PRIVATES_MAIN_TY:%.+]] = type {{.?}}{ [2 x [[S_DOUBLE_TY]]], [[S_DOUBLE_TY]], i32, [2 x i32] -// CHECK-DAG: [[CAP_MAIN_TY:%.+]] = type { [2 x [[S_DOUBLE_TY]]]*, [[S_DOUBLE_TY]]* } -// CHECK-DAG: [[KMP_TASK_MAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [[PRIVATES_MAIN_TY]] } -// CHECK-DAG: [[S_INT_TY:%.+]] = type { i32 } -// CHECK-DAG: [[CAP_TMAIN_TY:%.+]] = type { [2 x [[S_INT_TY]]]*, [[S_INT_TY]]* } -// CHECK-DAG: [[PRIVATES_TMAIN_TY:%.+]] = type { i32, [2 x i32], [2 x [[S_INT_TY]]], [[S_INT_TY]], [104 x i8] } -// CHECK-DAG: [[KMP_TASK_TMAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [{{[0-9]+}} x i8], [[PRIVATES_TMAIN_TY]] } template T tmain() { S ttt; @@ -58,48 +49,14 @@ T tmain() { int main() { static int sivar; #ifdef LAMBDA - // LAMBDA: [[G:@.+]] ={{.*}} global double - // LAMBDA: [[SIVAR:@.+]] = internal global i{{[0-9]+}} 0, - // LAMBDA-LABEL: @main - // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]]( [&]() { - // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]]( -// LAMBDA: [[RES:%.+]] = call {{.*}}i32 @__kmpc_master( -// LAMBDA-NEXT: [[IS_MASTER:%.+]] = icmp ne i32 [[RES]], 0 -// LAMBDA-NEXT: br i1 [[IS_MASTER]], label {{%?}}[[THEN:.+]], label {{%?}}[[EXIT:.+]] -// LAMBDA: [[THEN]] -// LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 96, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*)) -// LAMBDA: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1 -// LAMBDA: [[G_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0 -// LAMBDA: [[G_VAL:%.+]] = load volatile double, double* @{{.+}}, -// LAMBDA: store volatile double [[G_VAL]], double* [[G_PRIVATE_ADDR]] - -// LAMBDA: [[SIVAR_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1 -// LAMBDA: [[SIVAR_VAL:%.+]] = load 
i{{[0-9]+}}, i{{[0-9]+}}* @{{.+}}, -// LAMBDA: store i{{[0-9]+}} [[SIVAR_VAL]], i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]] - -// LAMBDA: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 1, i32 0, i64 0, i8* null) -// LAMBDA: call {{.*}}void @__kmpc_end_master( -// LAMBDA-NEXT: br label {{%?}}[[EXIT]] -// LAMBDA: [[EXIT]] -// LAMBDA: ret + + #pragma omp parallel master taskloop simd firstprivate(g, sivar) for (int i = 0; i < 10; ++i) { - // LAMBDA: define {{.+}} void [[INNER_LAMBDA:@.+]](%{{.+}}* {{[^,]*}} [[ARG_PTR:%.+]]) - // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]], - // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]] - // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 - // LAMBDA: [[G_REF:%.+]] = load double*, double** [[G_PTR_REF]] - // LAMBDA: store double 2.0{{.+}}, double* [[G_REF]] - - // LAMBDA: store double* %{{.+}}, double** %{{.+}}, - // LAMBDA: define internal noundef i32 [[TASK_ENTRY]](i32 noundef %0, %{{.+}}* noalias noundef %1) + g = 1; sivar = 11; - // LAMBDA: store double 1.0{{.+}}, double* %{{.+}}, - // LAMBDA: store i{{[0-9]+}} 11, i{{[0-9]+}}* %{{.+}}, - // LAMBDA: call void [[INNER_LAMBDA]](% - // LAMBDA: ret [&]() { g = 2; sivar = 22; @@ -108,51 +65,13 @@ int main() { }(); return 0; #elif defined(BLOCKS) - // BLOCKS: [[G:@.+]] ={{.*}} global double - // BLOCKS: [[SIVAR:@.+]] = internal global i{{[0-9]+}} 0, - // BLOCKS-LABEL: @main - // BLOCKS: call void {{%.+}}(i8 ^{ - // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8* - // BLOCKS: [[RES:%.+]] = call {{.*}}i32 @__kmpc_master( - // BLOCKS-NEXT: [[IS_MASTER:%.+]] = icmp ne i32 [[RES]], 0 - // BLOCKS-NEXT: br i1 [[IS_MASTER]], label {{%?}}[[THEN:.+]], label {{%?}}[[EXIT:.+]] - // BLOCKS: [[THEN]] - // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 96, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*)) - // BLOCKS: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1 - // BLOCKS: [[G_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0 - // BLOCKS: [[G_VAL:%.+]] = load volatile double, double* @{{.+}}, - // BLOCKS: store volatile double [[G_VAL]], double* [[G_PRIVATE_ADDR]] - - // BLOCKS: [[SIVAR_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1 - // BLOCKS: [[SIVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* @{{.+}}, - // BLOCKS: store i{{[0-9]+}} [[SIVAR_VAL]], i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]] - // BLOCKS: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 1, i32 0, i64 0, i8* null) - // BLOCKS: call {{.*}}void @__kmpc_end_master( - // BLOCKS-NEXT: br label {{%?}}[[EXIT]] - // BLOCKS: [[EXIT]] - // BLOCKS: ret + #pragma omp parallel master taskloop simd firstprivate(g, sivar) for (int i = 0; i < 10; ++i) { - // BLOCKS: define {{.+}} void {{@.+}}(i8* - // BLOCKS-NOT: [[G]]{{[[^:word:]]}} - // BLOCKS: store double 2.0{{.+}}, double* - // BLOCKS-NOT: [[G]]{{[[^:word:]]}} - // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}} - // BLOCKS: store i{{[0-9]+}} 22, i{{[0-9]+}}* - // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}} - // BLOCKS: ret - - // BLOCKS: store double* %{{.+}}, double** %{{.+}}, - // BLOCKS: store i{{[0-9]+}}* %{{.+}}, i{{[0-9]+}}** %{{.+}}, - // 
BLOCKS: define internal noundef i32 [[TASK_ENTRY]](i32 noundef %0, %{{.+}}* noalias noundef %1) + g = 1; sivar = 11; - // BLOCKS: store double 1.0{{.+}}, double* %{{.+}}, - // BLOCKS-NOT: [[G]]{{[[^:word:]]}} - // BLOCKS: store i{{[0-9]+}} 11, i{{[0-9]+}}* %{{.+}}, - // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}} - // BLOCKS: call void {{%.+}}(i8 ^{ g = 2; sivar = 22; @@ -177,303 +96,86 @@ int main() { #endif } -// CHECK: [[SIVAR:.+]] = internal global i{{[0-9]+}} 0, -// CHECK: define{{.*}} i{{[0-9]+}} @main() -// CHECK: alloca [[S_DOUBLE_TY]], -// CHECK: [[TEST:%.+]] = alloca [[S_DOUBLE_TY]], -// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32, -// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32], -// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]], -// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]], - -// CHECK: call {{.*}} [[S_DOUBLE_TY_COPY_CONSTR:@.+]]([[S_DOUBLE_TY]]* {{[^,]*}} [[TEST]], - -// CHECK: [[RES:%.+]] = call {{.*}}i32 @__kmpc_master( -// CHECK-NEXT: [[IS_MASTER:%.+]] = icmp ne i32 [[RES]], 0 -// CHECK-NEXT: br i1 [[IS_MASTER]], label {{%?}}[[THEN:.+]], label {{%?}}[[EXIT:.+]] -// CHECK: [[THEN]] + + // Store original variables in capture struct. -// CHECK: [[S_ARR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CHECK: store [2 x [[S_DOUBLE_TY]]]* %{{.+}}, [2 x [[S_DOUBLE_TY]]]** [[S_ARR_REF]], -// CHECK: [[VAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1 -// CHECK: store [[S_DOUBLE_TY]]* %{{.+}}, [[S_DOUBLE_TY]]** [[VAR_REF]], // Allocate task. // Returns struct kmp_task_t { // [[KMP_TASK_T]] task_data; // [[KMP_TASK_MAIN_TY]] privates; // }; -// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @{{.+}}, i32 %{{.+}}, i32 9, i64 120, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*)) -// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_MAIN_TY]]* // Fill kmp_task_t->shareds by copying from original capture argument. -// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]], -// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_MAIN_TY]]* %{{.+}} to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[SHAREDS_REF]], i8* align 8 [[CAPTURES_ADDR]], i64 16, i1 false) // Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes). // Also copy address of private copy to the corresponding shareds reference. -// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 // Constructors for s_arr and var. 
// s_arr; -// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CHECK: bitcast [2 x [[S_DOUBLE_TY]]]* %{{.+}} to [[S_DOUBLE_TY]]* -// CHECK: call void [[S_DOUBLE_TY_COPY_CONSTR]]([[S_DOUBLE_TY]]* {{[^,]*}} [[S_ARR_CUR:%[^,]+]], -// CHECK: getelementptr [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* [[S_ARR_CUR]], i{{.+}} 1 -// CHECK: getelementptr [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} 1 -// CHECK: icmp eq -// CHECK: br i1 // var; -// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1 -// CHECK-NEXT: call void [[S_DOUBLE_TY_COPY_CONSTR]]([[S_DOUBLE_TY]]* {{[^,]*}} [[PRIVATE_VAR_REF]], [[S_DOUBLE_TY]]* {{.*}}, // t_var; -// CHECK: [[PRIVATE_T_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2 -// CHECK-NEXT: [[T_VAR:%.+]] = load i32, i32* %{{.+}}, -// CHECK-NEXT: store i32 [[T_VAR]], i32* [[PRIVATE_T_VAR_REF]], // vec; -// CHECK: [[PRIVATE_VEC_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3 -// CHECK-NEXT: bitcast [2 x i32]* [[PRIVATE_VEC_REF]] to i8* -// CHECK-NEXT: bitcast [2 x i32]* %{{.+}} to i8* -// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64( // sivar; -// CHECK: [[PRIVATE_SIVAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 4 -// CHECK-NEXT: [[SIVAR:%.+]] = load i{{.+}}, i{{.+}}* @{{.+}}, -// CHECK-NEXT: store i32 [[SIVAR]], i32* [[PRIVATE_SIVAR_REF]], // Provide pointer to destructor function, which will destroy private variables at the end of the task. -// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3 -// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)** -// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]], // Start task. 
-// CHECK: call void @__kmpc_taskloop(%struct.ident_t* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 1, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_MAIN_TY]]*, [[KMP_TASK_MAIN_TY]]*, i32)* [[MAIN_DUP:@.+]] to i8*)) -// CHECK: call {{.*}}void @__kmpc_end_master( -// CHECK-NEXT: br label {{%?}}[[EXIT]] -// CHECK: [[EXIT]] - -// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_MAIN_TY]]* noalias noundef %0, [[S_DOUBLE_TY]]** noalias noundef %1, i32** noalias noundef %2, [2 x [[S_DOUBLE_TY]]]** noalias noundef %3, [2 x i32]** noalias noundef %4, i32** noalias noundef %5) -// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_MAIN_TY]]*, [[PRIVATES_MAIN_TY]]** -// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 0 -// CHECK: [[ARG3:%.+]] = load [2 x [[S_DOUBLE_TY]]]**, [2 x [[S_DOUBLE_TY]]]*** %{{.+}}, -// CHECK: store [2 x [[S_DOUBLE_TY]]]* [[PRIV_S_VAR]], [2 x [[S_DOUBLE_TY]]]** [[ARG3]], -// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 1 -// CHECK: [[ARG1:%.+]] = load [[S_DOUBLE_TY]]**, [[S_DOUBLE_TY]]*** {{.+}}, -// CHECK: store [[S_DOUBLE_TY]]* [[PRIV_VAR]], [[S_DOUBLE_TY]]** [[ARG1]], -// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 2 -// CHECK: [[ARG2:%.+]] = load i32**, i32*** %{{.+}}, -// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG2]], -// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 3 -// CHECK: [[ARG4:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}}, -// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG4]], -// CHECK: [[PRIV_SIVAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 4 -// CHECK: [[ARG5:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** %{{.+}}, -// CHECK: store i{{[0-9]+}}* [[PRIV_SIVAR]], i{{[0-9]+}}** [[ARG5]], -// CHECK: ret void - -// CHECK: define internal noundef i32 [[TASK_ENTRY]](i32 noundef %0, [[KMP_TASK_MAIN_TY]]* noalias noundef %1) - -// CHECK: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]]*, -// CHECK: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*, -// CHECK: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]]*, -// CHECK: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*, -// CHECK: [[PRIV_SIVAR_ADDR:%.+]] = alloca i32*, -// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_MAIN_TY]]*, [[S_DOUBLE_TY]]**, i32**, [2 x [[S_DOUBLE_TY]]]**, [2 x i32]**, i32**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]], -// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]], - -// CHECK: [[FN:%.+]] = bitcast void (i8*, ...)* [[MAP_FN]] to void (i8*, -// CHECK: call void [[FN]](i8* %{{.+}}, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]], i32** [[PRIV_T_VAR_ADDR]], [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], i32** [[PRIV_SIVAR_ADDR]]) - -// CHECK: [[PRIV_VAR:%.+]] = load [[S_DOUBLE_TY]]*, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]], -// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]], -// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_DOUBLE_TY]]]*, [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]], -// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]], -// CHECK: [[PRIV_SIVAR:%.+]] = load i32*, i32** [[PRIV_SIVAR_ADDR]], + + + + + // Privates actually are used. 
-// CHECK-DAG: [[PRIV_VAR]] -// CHECK-DAG: [[PRIV_T_VAR]] -// CHECK-DAG: [[PRIV_S_ARR]] -// CHECK-DAG: [[PRIV_VEC]] -// CHECK-DAG: [[PRIV_SIVAR]] - -// CHECK: ret - -// CHECK: define internal void [[MAIN_DUP]]([[KMP_TASK_MAIN_TY]]* noundef %0, [[KMP_TASK_MAIN_TY]]* noundef %1, i32 noundef %2) -// CHECK: getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* %{{.+}}, i32 0, i32 1 -// CHECK: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* %{{.+}}, i32 0, i32 0 -// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* %{{.+}}, i32 0, i32 0 -// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i64 2 -// CHECK: br i1 % - -// CHECK: phi [[S_DOUBLE_TY]]* -// CHECK: call {{.*}} [[S_DOUBLE_TY_COPY_CONSTR]]([[S_DOUBLE_TY]]* -// CHECK: getelementptr [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i32 1 -// CHECK: icmp eq [[S_DOUBLE_TY]]* % -// CHECK: br i1 % - -// CHECK: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* %{{.+}}, i32 0, i32 1 -// CHECK: call {{.*}} [[S_DOUBLE_TY_COPY_CONSTR]]([[S_DOUBLE_TY]]* -// CHECK: ret void - -// CHECK: define internal noundef i32 [[DESTRUCTORS]](i32 noundef %{{.+}}, [[KMP_TASK_MAIN_TY]]* noalias noundef %{{.+}}) -// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 -// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0 -// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1 -// CHECK: call void @_ZN1SIdED1Ev([[S_DOUBLE_TY]]* {{[^,]*}} [[PRIVATE_VAR_REF]]) -// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0 -// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} 2 -// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} -1 -// CHECK: call void @_ZN1SIdED1Ev([[S_DOUBLE_TY]]* {{[^,]*}} [[PRIVATE_S_ARR_ELEM_REF]]) -// CHECK: icmp eq -// CHECK: br i1 -// CHECK: ret i32 - -// CHECK: alloca [[S_INT_TY]], -// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]], -// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32, align 128 -// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32], -// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]], -// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]], - -// CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR:@.+]]([[S_INT_TY]]* {{[^,]*}} [[TEST]], + + + + + + + // Store original variables in capture struct. -// CHECK: [[S_ARR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CHECK: store [2 x [[S_INT_TY]]]* %{{.+}}, [2 x [[S_INT_TY]]]** [[S_ARR_REF]], -// CHECK: [[VAR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1 -// CHECK: store [[S_INT_TY]]* %{{.+}}, [[S_INT_TY]]** [[VAR_REF]], // Allocate task. 
// Returns struct kmp_task_t { // [[KMP_TASK_T_TY]] task_data; // [[KMP_TASK_TMAIN_TY]] privates; // }; -// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @{{.+}}, i32 %{{.+}}, i32 9, i64 256, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*)) -// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_TMAIN_TY]]* // Fill kmp_task_t->shareds by copying from original capture argument. -// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]], -// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_TMAIN_TY]]* %{{.+}} to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[SHAREDS_REF]], i8* align 8 [[CAPTURES_ADDR]], i64 16, i1 false) // Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes). -// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 2 // t_var; -// CHECK: [[PRIVATE_T_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0 -// CHECK: [[T_VAR:%.+]] = load i32, i32* %{{.+}}, align 128 -// CHECK: store i32 [[T_VAR]], i32* [[PRIVATE_T_VAR_REF]], align 128 // vec; -// CHECK: [[PRIVATE_VEC_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1 -// CHECK-NEXT: bitcast [2 x i32]* [[PRIVATE_VEC_REF]] to i8* -// CHECK-NEXT: bitcast [2 x i32]* %{{.+}} to i8* -// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64( // Constructors for s_arr and var. // a_arr; -// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 2 -// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0 -// CHECK: bitcast [2 x [[S_INT_TY]]]* %{{.+}} to [[S_INT_TY]]* -// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2 -// CHECK: call void [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]* {{[^,]*}} [[S_ARR_CUR:%[^,]+]], -// CHECK: getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_CUR]], i{{.+}} 1 -// CHECK: icmp eq -// CHECK: br i1 // var; -// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3 -// CHECK-NEXT: call void [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]* {{[^,]*}} [[PRIVATE_VAR_REF]], // Provide pointer to destructor function, which will destroy private variables at the end of the task. -// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3 -// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)** -// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]], // Start task. 
-// CHECK: call void @__kmpc_taskloop(%struct.ident_t* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 1, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_TMAIN_TY]]*, [[KMP_TASK_TMAIN_TY]]*, i32)* [[TMAIN_DUP:@.+]] to i8*)) - -// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_TMAIN_TY]]* noalias noundef %{{.+}}, i32** noalias noundef %{{.+}}, [2 x i32]** noalias noundef %{{.+}}, [2 x [[S_INT_TY]]]** noalias noundef %{{.+}}, [[S_INT_TY]]** noalias noundef %{{.+}}) -// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_TMAIN_TY]]*, [[PRIVATES_TMAIN_TY]]** -// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 0 -// CHECK: [[ARG1:%.+]] = load i32**, i32*** %{{.+}}, -// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG1]], -// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 1 -// CHECK: [[ARG2:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}}, -// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG2]], -// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 2 -// CHECK: [[ARG3:%.+]] = load [2 x [[S_INT_TY]]]**, [2 x [[S_INT_TY]]]*** %{{.+}}, -// CHECK: store [2 x [[S_INT_TY]]]* [[PRIV_S_VAR]], [2 x [[S_INT_TY]]]** [[ARG3]], -// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 3 -// CHECK: [[ARG4:%.+]] = load [[S_INT_TY]]**, [[S_INT_TY]]*** {{.+}}, -// CHECK: store [[S_INT_TY]]* [[PRIV_VAR]], [[S_INT_TY]]** [[ARG4]], -// CHECK: ret void - -// CHECK: define internal noundef i32 [[TASK_ENTRY]](i32 noundef %0, [[KMP_TASK_TMAIN_TY]]* noalias noundef %1) -// CHECK: alloca i32*, -// CHECK-DAG: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*, -// CHECK-DAG: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*, -// CHECK-DAG: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*, -// CHECK-DAG: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*, -// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_TMAIN_TY]]*, i32**, [2 x i32]**, [2 x [[S_INT_TY]]]**, [[S_INT_TY]]**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]], -// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]], -// CHECK: [[FN:%.+]] = bitcast void (i8*, ...)* [[MAP_FN]] to void (i8*, -// CHECK: call void [[FN]](i8* %{{.+}}, i32** [[PRIV_T_VAR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]], [[S_INT_TY]]** [[PRIV_VAR_ADDR]]) -// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]], -// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]], -// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]], -// CHECK: [[PRIV_VAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[PRIV_VAR_ADDR]], + + // Privates actually are used. 
-// CHECK-DAG: [[PRIV_VAR]] -// CHECK-DAG: [[PRIV_T_VAR]] -// CHECK-DAG: [[PRIV_S_ARR]] -// CHECK-DAG: [[PRIV_VEC]] - -// CHECK: ret - -// CHECK: define internal void [[TMAIN_DUP]]([[KMP_TASK_TMAIN_TY]]* noundef %0, [[KMP_TASK_TMAIN_TY]]* noundef %1, i32 noundef %2) -// CHECK: getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* %{{.+}}, i32 0, i32 2 -// CHECK: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* %{{.+}}, i32 0, i32 2 -// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* %{{.+}}, i32 0, i32 0 -// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i64 2 -// CHECK: br i1 % - -// CHECK: phi [[S_INT_TY]]* -// CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]* -// CHECK: getelementptr [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i32 1 -// CHECK: icmp eq [[S_INT_TY]]* % -// CHECK: br i1 % - -// CHECK: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* %{{.+}}, i32 0, i32 3 -// CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]* -// CHECK: ret void - -// CHECK: define internal noundef i32 [[DESTRUCTORS]](i32 noundef %0, [[KMP_TASK_TMAIN_TY]]* noalias noundef %1) -// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 2 -// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2 -// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3 -// CHECK: call void @_ZN1SIiED1Ev([[S_INT_TY]]* {{[^,]*}} [[PRIVATE_VAR_REF]]) -// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0 -// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2 -// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} -1 -// CHECK: call void @_ZN1SIiED1Ev([[S_INT_TY]]* {{[^,]*}} [[PRIVATE_S_ARR_ELEM_REF]]) -// CHECK: icmp eq -// CHECK: br i1 -// CHECK: ret i32 + + + + + #endif #else -// ARRAY-LABEL: array_func struct St { int a, b; St() : a(0), b(0) {} @@ -482,13 +184,2309 @@ struct St { }; void array_func(int n, float a[n], St s[2]) { -// ARRAY: call i8* @__kmpc_omp_task_alloc( -// ARRAY: call void @__kmpc_taskloop( -// ARRAY: store float** %{{.+}}, float*** %{{.+}}, -// ARRAY: store %struct.St** %{{.+}}, %struct.St*** %{{.+}}, #pragma omp parallel master taskloop simd firstprivate(a, s) for (int i = 0; i < 10; ++i) ; } #endif +// CHECK-LABEL: define {{[^@]+}}@main +// CHECK-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[TTT:%.*]] = alloca [[STRUCT_S:%.*]], align 8 +// CHECK-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S]], align 8 +// CHECK-NEXT: [[T_VAR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4 +// CHECK-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S], align 16 +// CHECK-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S]], align 8 +// CHECK-NEXT: [[T_VAR_CASTED:%.*]] = alloca i64, align 8 +// CHECK-NEXT: store i32 0, i32* [[RETVAL]], align 4 +// CHECK-NEXT: call void @_ZN1SIdEC1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[TTT]]) +// CHECK-NEXT: call void @_ZN1SIdEC1ERKS0_d(%struct.S* noundef nonnull align 8 dereferenceable(8) [[TEST]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[TTT]], double noundef 0.000000e+00) 
+// CHECK-NEXT: store i32 0, i32* [[T_VAR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [2 x i32]* [[VEC]] to i8* +// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP0]], i8* align 4 bitcast ([2 x i32]* @__const.main.vec to i8*), i64 8, i1 false) +// CHECK-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[S_ARR]], i64 0, i64 0 +// CHECK-NEXT: call void @_ZN1SIdEC1Ed(%struct.S* noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_BEGIN]], double noundef 1.000000e+00) +// CHECK-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[ARRAYINIT_BEGIN]], i64 1 +// CHECK-NEXT: call void @_ZN1SIdEC1Ed(%struct.S* noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_ELEMENT]], double noundef 2.000000e+00) +// CHECK-NEXT: call void @_ZN1SIdEC1Ed(%struct.S* noundef nonnull align 8 dereferenceable(8) [[VAR]], double noundef 3.000000e+00) +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[T_VAR]], align 4 +// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[T_VAR_CASTED]] to i32* +// CHECK-NEXT: store i32 [[TMP1]], i32* [[CONV]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i64, i64* [[T_VAR_CASTED]], align 8 +// CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, [2 x i32]*, i64, [2 x %struct.S]*, %struct.S*)* @.omp_outlined. to void (i32*, i32*, ...)*), [2 x i32]* [[VEC]], i64 [[TMP2]], [2 x %struct.S]* [[S_ARR]], %struct.S* [[VAR]]) +// CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @_Z5tmainIiET_v() +// CHECK-NEXT: store i32 [[CALL]], i32* [[RETVAL]], align 4 +// CHECK-NEXT: call void @_ZN1SIdED1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[VAR]]) #[[ATTR4:[0-9]+]] +// CHECK-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[S_ARR]], i32 0, i32 0 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[ARRAY_BEGIN]], i64 2 +// CHECK-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] +// CHECK: arraydestroy.body: +// CHECK-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi %struct.S* [ [[TMP3]], [[ENTRY:%.*]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] +// CHECK-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 +// CHECK-NEXT: call void @_ZN1SIdED1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq %struct.S* [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] +// CHECK-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE1:%.*]], label [[ARRAYDESTROY_BODY]] +// CHECK: arraydestroy.done1: +// CHECK-NEXT: call void @_ZN1SIdED1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[TEST]]) #[[ATTR4]] +// CHECK-NEXT: call void @_ZN1SIdED1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[TTT]]) #[[ATTR4]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[RETVAL]], align 4 +// CHECK-NEXT: ret i32 [[TMP4]] +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIdEC1Ev +// CHECK-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// CHECK-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: call void 
@_ZN1SIdEC2Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS1]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIdEC1ERKS0_d +// CHECK-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[S:%.*]], double noundef [[T:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// CHECK-NEXT: [[S_ADDR:%.*]] = alloca %struct.S*, align 8 +// CHECK-NEXT: [[T_ADDR:%.*]] = alloca double, align 8 +// CHECK-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: store %struct.S* [[S]], %struct.S** [[S_ADDR]], align 8 +// CHECK-NEXT: store double [[T]], double* [[T_ADDR]], align 8 +// CHECK-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load %struct.S*, %struct.S** [[S_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[T_ADDR]], align 8 +// CHECK-NEXT: call void @_ZN1SIdEC2ERKS0_d(%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS1]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[TMP0]], double noundef [[TMP1]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIdEC1Ed +// CHECK-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], double noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca double, align 8 +// CHECK-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: store double [[A]], double* [[A_ADDR]], align 8 +// CHECK-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[A_ADDR]], align 8 +// CHECK-NEXT: call void @_ZN1SIdEC2Ed(%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS1]], double noundef [[TMP0]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@.omp_outlined. 
+// CHECK-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], [2 x i32]* noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 noundef [[T_VAR:%.*]], [2 x %struct.S]* noundef nonnull align 8 dereferenceable(16) [[S_ARR:%.*]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[VAR:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[VEC_ADDR:%.*]] = alloca [2 x i32]*, align 8 +// CHECK-NEXT: [[T_VAR_ADDR:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[S_ARR_ADDR:%.*]] = alloca [2 x %struct.S]*, align 8 +// CHECK-NEXT: [[VAR_ADDR:%.*]] = alloca %struct.S*, align 8 +// CHECK-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 8 +// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK-NEXT: store [2 x i32]* [[VEC]], [2 x i32]** [[VEC_ADDR]], align 8 +// CHECK-NEXT: store i64 [[T_VAR]], i64* [[T_VAR_ADDR]], align 8 +// CHECK-NEXT: store [2 x %struct.S]* [[S_ARR]], [2 x %struct.S]** [[S_ARR_ADDR]], align 8 +// CHECK-NEXT: store %struct.S* [[VAR]], %struct.S** [[VAR_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load [2 x i32]*, [2 x i32]** [[VEC_ADDR]], align 8 +// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[T_VAR_ADDR]] to i32* +// CHECK-NEXT: [[TMP1:%.*]] = load [2 x %struct.S]*, [2 x %struct.S]** [[S_ARR_ADDR]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load %struct.S*, %struct.S** [[VAR_ADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = icmp ne i32 [[TMP5]], 0 +// CHECK-NEXT: br i1 [[TMP6]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_END:%.*]] +// CHECK: omp_if.then: +// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[AGG_CAPTURED]], i32 0, i32 0 +// CHECK-NEXT: store [2 x %struct.S]* [[TMP1]], [2 x %struct.S]** [[TMP7]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[AGG_CAPTURED]], i32 0, i32 1 +// CHECK-NEXT: store %struct.S* [[TMP2]], %struct.S** [[TMP8]], align 8 +// CHECK-NEXT: call void @__kmpc_taskgroup(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK-NEXT: [[TMP9:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]], i32 9, i64 120, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @.omp_task_entry. 
to i32 (i32, i8*)*)) +// CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to %struct.kmp_task_t_with_privates* +// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], %struct.kmp_task_t_with_privates* [[TMP10]], i32 0, i32 0 +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP11]], i32 0, i32 0 +// CHECK-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast %struct.anon* [[AGG_CAPTURED]] to i8* +// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP13]], i8* align 8 [[TMP14]], i64 16, i1 false) +// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], %struct.kmp_task_t_with_privates* [[TMP10]], i32 0, i32 1 +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP15]], i32 0, i32 0 +// CHECK-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[TMP16]], i32 0, i32 0 +// CHECK-NEXT: [[TMP17:%.*]] = bitcast [2 x %struct.S]* [[TMP1]] to %struct.S* +// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[ARRAY_BEGIN]], i64 2 +// CHECK-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq %struct.S* [[ARRAY_BEGIN]], [[TMP18]] +// CHECK-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE1:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]] +// CHECK: omp.arraycpy.body: +// CHECK-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi %struct.S* [ [[TMP17]], [[OMP_IF_THEN]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] +// CHECK-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi %struct.S* [ [[ARRAY_BEGIN]], [[OMP_IF_THEN]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] +// CHECK-NEXT: call void @_ZN1SIdEC1ERKS0_d(%struct.S* noundef nonnull align 8 dereferenceable(8) [[OMP_ARRAYCPY_DESTELEMENTPAST]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[OMP_ARRAYCPY_SRCELEMENTPAST]], double noundef 0.000000e+00) +// CHECK-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr [[STRUCT_S]], %struct.S* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1 +// CHECK-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr [[STRUCT_S]], %struct.S* [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1 +// CHECK-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq %struct.S* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP18]] +// CHECK-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE1]], label [[OMP_ARRAYCPY_BODY]] +// CHECK: omp.arraycpy.done1: +// CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP15]], i32 0, i32 1 +// CHECK-NEXT: call void @_ZN1SIdEC1ERKS0_d(%struct.S* noundef nonnull align 8 dereferenceable(8) [[TMP19]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[TMP2]], double noundef 0.000000e+00) +// CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP15]], i32 0, i32 2 +// CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[CONV]], align 4 +// CHECK-NEXT: store i32 [[TMP21]], i32* [[TMP20]], align 8 +// CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP15]], i32 0, i32 3 +// CHECK-NEXT: [[TMP23:%.*]] = bitcast [2 x i32]* [[TMP22]] to i8* +// CHECK-NEXT: [[TMP24:%.*]] = bitcast [2 x i32]* [[TMP0]] to i8* +// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP23]], i8* align 4 [[TMP24]], i64 8, i1 false) +// CHECK-NEXT: 
[[TMP25:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP15]], i32 0, i32 4 +// CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* @_ZZ4mainE5sivar, align 4 +// CHECK-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4 +// CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP11]], i32 0, i32 3 +// CHECK-NEXT: [[TMP28:%.*]] = bitcast %union.kmp_cmplrdata_t* [[TMP27]] to i32 (i32, i8*)** +// CHECK-NEXT: store i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @.omp_task_destructor. to i32 (i32, i8*)*), i32 (i32, i8*)** [[TMP28]], align 8 +// CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP11]], i32 0, i32 5 +// CHECK-NEXT: store i64 0, i64* [[TMP29]], align 8 +// CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP11]], i32 0, i32 6 +// CHECK-NEXT: store i64 9, i64* [[TMP30]], align 8 +// CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP11]], i32 0, i32 7 +// CHECK-NEXT: store i64 1, i64* [[TMP31]], align 8 +// CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP11]], i32 0, i32 9 +// CHECK-NEXT: [[TMP33:%.*]] = bitcast i8** [[TMP32]] to i8* +// CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP33]], i8 0, i64 8, i1 false) +// CHECK-NEXT: [[TMP34:%.*]] = load i64, i64* [[TMP31]], align 8 +// CHECK-NEXT: call void @__kmpc_taskloop(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]], i8* [[TMP9]], i32 1, i64* [[TMP29]], i64* [[TMP30]], i64 [[TMP34]], i32 1, i32 0, i64 0, i8* bitcast (void (%struct.kmp_task_t_with_privates*, %struct.kmp_task_t_with_privates*, i32)* @.omp_task_dup. to i8*)) +// CHECK-NEXT: call void @__kmpc_end_taskgroup(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK-NEXT: br label [[OMP_IF_END]] +// CHECK: omp_if.end: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@.omp_task_privates_map. 
+// CHECK-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], %struct.S** noalias noundef [[TMP1:%.*]], i32** noalias noundef [[TMP2:%.*]], [2 x %struct.S]** noalias noundef [[TMP3:%.*]], [2 x i32]** noalias noundef [[TMP4:%.*]], i32** noalias noundef [[TMP5:%.*]]) #[[ATTR6:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca %struct..kmp_privates.t*, align 8 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca %struct.S**, align 8 +// CHECK-NEXT: [[DOTADDR2:%.*]] = alloca i32**, align 8 +// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca [2 x %struct.S]**, align 8 +// CHECK-NEXT: [[DOTADDR4:%.*]] = alloca [2 x i32]**, align 8 +// CHECK-NEXT: [[DOTADDR5:%.*]] = alloca i32**, align 8 +// CHECK-NEXT: store %struct..kmp_privates.t* [[TMP0]], %struct..kmp_privates.t** [[DOTADDR]], align 8 +// CHECK-NEXT: store %struct.S** [[TMP1]], %struct.S*** [[DOTADDR1]], align 8 +// CHECK-NEXT: store i32** [[TMP2]], i32*** [[DOTADDR2]], align 8 +// CHECK-NEXT: store [2 x %struct.S]** [[TMP3]], [2 x %struct.S]*** [[DOTADDR3]], align 8 +// CHECK-NEXT: store [2 x i32]** [[TMP4]], [2 x i32]*** [[DOTADDR4]], align 8 +// CHECK-NEXT: store i32** [[TMP5]], i32*** [[DOTADDR5]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load %struct..kmp_privates.t*, %struct..kmp_privates.t** [[DOTADDR]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP6]], i32 0, i32 0 +// CHECK-NEXT: [[TMP8:%.*]] = load [2 x %struct.S]**, [2 x %struct.S]*** [[DOTADDR3]], align 8 +// CHECK-NEXT: store [2 x %struct.S]* [[TMP7]], [2 x %struct.S]** [[TMP8]], align 8 +// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP6]], i32 0, i32 1 +// CHECK-NEXT: [[TMP10:%.*]] = load %struct.S**, %struct.S*** [[DOTADDR1]], align 8 +// CHECK-NEXT: store %struct.S* [[TMP9]], %struct.S** [[TMP10]], align 8 +// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP6]], i32 0, i32 2 +// CHECK-NEXT: [[TMP12:%.*]] = load i32**, i32*** [[DOTADDR2]], align 8 +// CHECK-NEXT: store i32* [[TMP11]], i32** [[TMP12]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP6]], i32 0, i32 3 +// CHECK-NEXT: [[TMP14:%.*]] = load [2 x i32]**, [2 x i32]*** [[DOTADDR4]], align 8 +// CHECK-NEXT: store [2 x i32]* [[TMP13]], [2 x i32]** [[TMP14]], align 8 +// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP6]], i32 0, i32 4 +// CHECK-NEXT: [[TMP16:%.*]] = load i32**, i32*** [[DOTADDR5]], align 8 +// CHECK-NEXT: store i32* [[TMP15]], i32** [[TMP16]], align 8 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@.omp_task_entry. 
+// CHECK-SAME: (i32 noundef [[TMP0:%.*]], %struct.kmp_task_t_with_privates* noalias noundef [[TMP1:%.*]]) #[[ATTR7:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca i8*, align 8 +// CHECK-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca void (i8*, ...)*, align 8 +// CHECK-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca i8*, align 8 +// CHECK-NEXT: [[DOTLB__ADDR_I:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[DOTUB__ADDR_I:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[DOTST__ADDR_I:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[DOTLITER__ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTREDUCTIONS__ADDR_I:%.*]] = alloca i8*, align 8 +// CHECK-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca %struct.anon*, align 8 +// CHECK-NEXT: [[DOTFIRSTPRIV_PTR_ADDR_I:%.*]] = alloca %struct.S*, align 8 +// CHECK-NEXT: [[DOTFIRSTPRIV_PTR_ADDR1_I:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[DOTFIRSTPRIV_PTR_ADDR2_I:%.*]] = alloca [2 x %struct.S]*, align 8 +// CHECK-NEXT: [[DOTFIRSTPRIV_PTR_ADDR3_I:%.*]] = alloca [2 x i32]*, align 8 +// CHECK-NEXT: [[DOTFIRSTPRIV_PTR_ADDR4_I:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[I_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_IV_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca %struct.kmp_task_t_with_privates*, align 8 +// CHECK-NEXT: store i32 [[TMP0]], i32* [[DOTADDR]], align 4 +// CHECK-NEXT: store %struct.kmp_task_t_with_privates* [[TMP1]], %struct.kmp_task_t_with_privates** [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTADDR]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load %struct.kmp_task_t_with_privates*, %struct.kmp_task_t_with_privates** [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], %struct.kmp_task_t_with_privates* [[TMP3]], i32 0, i32 0 +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 2 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 0 +// CHECK-NEXT: [[TMP7:%.*]] = load i8*, i8** [[TMP6]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct.anon* +// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], %struct.kmp_task_t_with_privates* [[TMP3]], i32 0, i32 1 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast %struct..kmp_privates.t* [[TMP9]] to i8* +// CHECK-NEXT: [[TMP11:%.*]] = bitcast %struct.kmp_task_t_with_privates* [[TMP3]] to i8* +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 5 +// CHECK-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 6 +// CHECK-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8 +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 7 +// CHECK-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8 +// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 8 +// CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds 
[[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 9 +// CHECK-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 8 +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]]) +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]]) +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META8:![0-9]+]]) +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]]) +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]]) +// CHECK-NEXT: store i32 [[TMP2]], i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14 +// CHECK-NEXT: store i32* [[TMP5]], i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !14 +// CHECK-NEXT: store i8* [[TMP10]], i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !14 +// CHECK-NEXT: store void (i8*, ...)* bitcast (void (%struct..kmp_privates.t*, %struct.S**, i32**, [2 x %struct.S]**, [2 x i32]**, i32**)* @.omp_task_privates_map. to void (i8*, ...)*), void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !14 +// CHECK-NEXT: store i8* [[TMP11]], i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !14 +// CHECK-NEXT: store i64 [[TMP13]], i64* [[DOTLB__ADDR_I]], align 8, !noalias !14 +// CHECK-NEXT: store i64 [[TMP15]], i64* [[DOTUB__ADDR_I]], align 8, !noalias !14 +// CHECK-NEXT: store i64 [[TMP17]], i64* [[DOTST__ADDR_I]], align 8, !noalias !14 +// CHECK-NEXT: store i32 [[TMP19]], i32* [[DOTLITER__ADDR_I]], align 4, !noalias !14 +// CHECK-NEXT: store i8* [[TMP21]], i8** [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias !14 +// CHECK-NEXT: store %struct.anon* [[TMP8]], %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !14 +// CHECK-NEXT: [[TMP22:%.*]] = load %struct.anon*, %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !14 +// CHECK-NEXT: [[TMP23:%.*]] = load void (i8*, ...)*, void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !14 +// CHECK-NEXT: [[TMP24:%.*]] = load i8*, i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !14 +// CHECK-NEXT: [[TMP25:%.*]] = bitcast void (i8*, ...)* [[TMP23]] to void (i8*, %struct.S**, i32**, [2 x %struct.S]**, [2 x i32]**, i32**)* +// CHECK-NEXT: call void [[TMP25]](i8* [[TMP24]], %struct.S** [[DOTFIRSTPRIV_PTR_ADDR_I]], i32** [[DOTFIRSTPRIV_PTR_ADDR1_I]], [2 x %struct.S]** [[DOTFIRSTPRIV_PTR_ADDR2_I]], [2 x i32]** [[DOTFIRSTPRIV_PTR_ADDR3_I]], i32** [[DOTFIRSTPRIV_PTR_ADDR4_I]]) #[[ATTR4]] +// CHECK-NEXT: [[TMP26:%.*]] = load %struct.S*, %struct.S** [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !14 +// CHECK-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias !14 +// CHECK-NEXT: [[TMP28:%.*]] = load [2 x %struct.S]*, [2 x %struct.S]** [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 8, !noalias !14 +// CHECK-NEXT: [[TMP29:%.*]] = load [2 x i32]*, [2 x i32]** [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 8, !noalias !14 +// CHECK-NEXT: [[TMP30:%.*]] = load i32*, i32** [[DOTFIRSTPRIV_PTR_ADDR4_I]], align 8, !noalias !14 +// CHECK-NEXT: [[TMP31:%.*]] = load i64, i64* [[DOTLB__ADDR_I]], align 8, !noalias !14 +// CHECK-NEXT: [[CONV_I:%.*]] = trunc i64 [[TMP31]] to i32 +// CHECK-NEXT: store i32 [[CONV_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !14 +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND_I:%.*]] +// CHECK: omp.inner.for.cond.i: +// CHECK-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15:![0-9]+]] +// CHECK-NEXT: [[CONV5_I:%.*]] = sext i32 [[TMP32]] to i64 +// CHECK-NEXT: [[TMP33:%.*]] = load 
i64, i64* [[DOTUB__ADDR_I]], align 8, !noalias !14, !llvm.access.group [[ACC_GRP15]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule i64 [[CONV5_I]], [[TMP33]] +// CHECK-NEXT: br i1 [[CMP_I]], label [[OMP_INNER_FOR_BODY_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]] +// CHECK: omp.inner.for.body.i: +// CHECK-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]] +// CHECK-NEXT: store i32 [[TMP34]], i32* [[I_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]] +// CHECK-NEXT: [[TMP35:%.*]] = load i32, i32* [[TMP27]], align 4, !llvm.access.group [[ACC_GRP15]] +// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[TMP29]], i64 0, i64 0 +// CHECK-NEXT: store i32 [[TMP35]], i32* [[ARRAYIDX_I]], align 4, !llvm.access.group [[ACC_GRP15]] +// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[TMP28]], i64 0, i64 0 +// CHECK-NEXT: [[TMP36:%.*]] = bitcast %struct.S* [[ARRAYIDX6_I]] to i8* +// CHECK-NEXT: [[TMP37:%.*]] = bitcast %struct.S* [[TMP26]] to i8* +// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP36]], i8* align 8 [[TMP37]], i64 8, i1 false), !llvm.access.group [[ACC_GRP15]] +// CHECK-NEXT: store i32 33, i32* [[TMP30]], align 4, !llvm.access.group [[ACC_GRP15]] +// CHECK-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]] +// CHECK-NEXT: [[ADD7_I:%.*]] = add nsw i32 [[TMP38]], 1 +// CHECK-NEXT: store i32 [[ADD7_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]] +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND_I]], !llvm.loop [[LOOP16:![0-9]+]] +// CHECK: .omp_outlined..1.exit: +// CHECK-NEXT: ret i32 0 +// +// +// CHECK-LABEL: define {{[^@]+}}@.omp_task_dup. 
+// CHECK-SAME: (%struct.kmp_task_t_with_privates* noundef [[TMP0:%.*]], %struct.kmp_task_t_with_privates* noundef [[TMP1:%.*]], i32 noundef [[TMP2:%.*]]) #[[ATTR7]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca %struct.kmp_task_t_with_privates*, align 8 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca %struct.kmp_task_t_with_privates*, align 8 +// CHECK-NEXT: [[DOTADDR2:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store %struct.kmp_task_t_with_privates* [[TMP0]], %struct.kmp_task_t_with_privates** [[DOTADDR]], align 8 +// CHECK-NEXT: store %struct.kmp_task_t_with_privates* [[TMP1]], %struct.kmp_task_t_with_privates** [[DOTADDR1]], align 8 +// CHECK-NEXT: store i32 [[TMP2]], i32* [[DOTADDR2]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load %struct.kmp_task_t_with_privates*, %struct.kmp_task_t_with_privates** [[DOTADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load %struct.kmp_task_t_with_privates*, %struct.kmp_task_t_with_privates** [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], %struct.kmp_task_t_with_privates* [[TMP4]], i32 0, i32 0 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP5]], i32 0, i32 0 +// CHECK-NEXT: [[TMP7:%.*]] = load i8*, i8** [[TMP6]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], %struct.kmp_task_t_with_privates* [[TMP3]], i32 0, i32 1 +// CHECK-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP7]] to %struct.anon* +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP8]], i32 0, i32 0 +// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], %struct.anon* [[TMP9]], i32 0, i32 0 +// CHECK-NEXT: [[TMP12:%.*]] = load [2 x %struct.S]*, [2 x %struct.S]** [[TMP11]], align 8 +// CHECK-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[TMP10]], i32 0, i32 0 +// CHECK-NEXT: [[TMP13:%.*]] = bitcast [2 x %struct.S]* [[TMP12]] to %struct.S* +// CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[ARRAY_BEGIN]], i64 2 +// CHECK-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq %struct.S* [[ARRAY_BEGIN]], [[TMP14]] +// CHECK-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE3:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]] +// CHECK: omp.arraycpy.body: +// CHECK-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi %struct.S* [ [[TMP13]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] +// CHECK-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi %struct.S* [ [[ARRAY_BEGIN]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] +// CHECK-NEXT: call void @_ZN1SIdEC1ERKS0_d(%struct.S* noundef nonnull align 8 dereferenceable(8) [[OMP_ARRAYCPY_DESTELEMENTPAST]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[OMP_ARRAYCPY_SRCELEMENTPAST]], double noundef 0.000000e+00) +// CHECK-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr [[STRUCT_S]], %struct.S* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1 +// CHECK-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr [[STRUCT_S]], %struct.S* [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1 +// CHECK-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq %struct.S* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP14]] +// CHECK-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE3]], label [[OMP_ARRAYCPY_BODY]] +// CHECK: omp.arraycpy.done3: +// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds 
[[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP8]], i32 0, i32 1 +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP9]], i32 0, i32 1 +// CHECK-NEXT: [[TMP17:%.*]] = load %struct.S*, %struct.S** [[TMP16]], align 8 +// CHECK-NEXT: call void @_ZN1SIdEC1ERKS0_d(%struct.S* noundef nonnull align 8 dereferenceable(8) [[TMP15]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[TMP17]], double noundef 0.000000e+00) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@.omp_task_destructor. +// CHECK-SAME: (i32 noundef [[TMP0:%.*]], %struct.kmp_task_t_with_privates* noalias noundef [[TMP1:%.*]]) #[[ATTR7]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca %struct.kmp_task_t_with_privates*, align 8 +// CHECK-NEXT: store i32 [[TMP0]], i32* [[DOTADDR]], align 4 +// CHECK-NEXT: store %struct.kmp_task_t_with_privates* [[TMP1]], %struct.kmp_task_t_with_privates** [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load %struct.kmp_task_t_with_privates*, %struct.kmp_task_t_with_privates** [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], %struct.kmp_task_t_with_privates* [[TMP2]], i32 0, i32 1 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP3]], i32 0, i32 0 +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP3]], i32 0, i32 1 +// CHECK-NEXT: call void @_ZN1SIdED1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[TMP5]]) #[[ATTR4]] +// CHECK-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[TMP4]], i32 0, i32 0 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[ARRAY_BEGIN]], i64 2 +// CHECK-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] +// CHECK: arraydestroy.body: +// CHECK-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi %struct.S* [ [[TMP6]], [[ENTRY:%.*]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] +// CHECK-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 +// CHECK-NEXT: call void @_ZN1SIdED1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq %struct.S* [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] +// CHECK-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE2:%.*]], label [[ARRAYDESTROY_BODY]] +// CHECK: arraydestroy.done2: +// CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[RETVAL]], align 4 +// CHECK-NEXT: ret i32 [[TMP7]] +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIdED1Ev +// CHECK-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// CHECK-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: call void @_ZN1SIdED2Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS1]]) #[[ATTR4]] +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_Z5tmainIiET_v +// CHECK-SAME: () #[[ATTR9:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// CHECK-NEXT: 
[[TTT:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 +// CHECK-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0]], align 4 +// CHECK-NEXT: [[T_VAR:%.*]] = alloca i32, align 128 +// CHECK-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4 +// CHECK-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S.0], align 4 +// CHECK-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S_0]], align 4 +// CHECK-NEXT: [[T_VAR_CASTED:%.*]] = alloca i64, align 8 +// CHECK-NEXT: call void @_ZN1SIiEC1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TTT]]) +// CHECK-NEXT: call void @_ZN1SIiEC1ERKS0_i(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TEST]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TTT]], i32 noundef 0) +// CHECK-NEXT: store i32 0, i32* [[T_VAR]], align 128 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [2 x i32]* [[VEC]] to i8* +// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP0]], i8* align 4 bitcast ([2 x i32]* @__const._Z5tmainIiET_v.vec to i8*), i64 8, i1 false) +// CHECK-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[S_ARR]], i64 0, i64 0 +// CHECK-NEXT: call void @_ZN1SIiEC1Ei(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1) +// CHECK-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], %struct.S.0* [[ARRAYINIT_BEGIN]], i64 1 +// CHECK-NEXT: call void @_ZN1SIiEC1Ei(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2) +// CHECK-NEXT: call void @_ZN1SIiEC1Ei(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[VAR]], i32 noundef 3) +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[T_VAR]], align 128 +// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[T_VAR_CASTED]] to i32* +// CHECK-NEXT: store i32 [[TMP1]], i32* [[CONV]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i64, i64* [[T_VAR_CASTED]], align 8 +// CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, [2 x i32]*, i64, [2 x %struct.S.0]*, %struct.S.0*)* @.omp_outlined..2 to void (i32*, i32*, ...)*), [2 x i32]* [[VEC]], i64 [[TMP2]], [2 x %struct.S.0]* [[S_ARR]], %struct.S.0* [[VAR]]) +// CHECK-NEXT: store i32 0, i32* [[RETVAL]], align 4 +// CHECK-NEXT: call void @_ZN1SIiED1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR4]] +// CHECK-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[S_ARR]], i32 0, i32 0 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_S_0]], %struct.S.0* [[ARRAY_BEGIN]], i64 2 +// CHECK-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] +// CHECK: arraydestroy.body: +// CHECK-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi %struct.S.0* [ [[TMP3]], [[ENTRY:%.*]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] +// CHECK-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], %struct.S.0* [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 +// CHECK-NEXT: call void @_ZN1SIiED1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq %struct.S.0* [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] +// CHECK-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE1:%.*]], label [[ARRAYDESTROY_BODY]] +// CHECK: arraydestroy.done1: +// CHECK-NEXT: call void @_ZN1SIiED1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK-NEXT: call void @_ZN1SIiED1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TTT]]) #[[ATTR4]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[RETVAL]], align 4 +// CHECK-NEXT: ret i32 [[TMP4]] +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIdEC2Ev +// CHECK-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// CHECK-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[THIS1]], i32 0, i32 0 +// CHECK-NEXT: store double 0.000000e+00, double* [[F]], align 8 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIdEC2ERKS0_d +// CHECK-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[S:%.*]], double noundef [[T:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// CHECK-NEXT: [[S_ADDR:%.*]] = alloca %struct.S*, align 8 +// CHECK-NEXT: [[T_ADDR:%.*]] = alloca double, align 8 +// CHECK-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: store %struct.S* [[S]], %struct.S** [[S_ADDR]], align 8 +// CHECK-NEXT: store double [[T]], double* [[T_ADDR]], align 8 +// CHECK-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[THIS1]], i32 0, i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = load %struct.S*, %struct.S** [[S_ADDR]], align 8 +// CHECK-NEXT: [[F2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[TMP0]], i32 0, i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[F2]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load double, 
double* [[T_ADDR]], align 8 +// CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], [[TMP2]] +// CHECK-NEXT: store double [[ADD]], double* [[F]], align 8 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIdEC2Ed +// CHECK-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], double noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca double, align 8 +// CHECK-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: store double [[A]], double* [[A_ADDR]], align 8 +// CHECK-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[THIS1]], i32 0, i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[A_ADDR]], align 8 +// CHECK-NEXT: store double [[TMP0]], double* [[F]], align 8 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIdED2Ev +// CHECK-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// CHECK-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIiEC1Ev +// CHECK-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// CHECK-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: call void @_ZN1SIiEC2Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS1]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIiEC1ERKS0_i +// CHECK-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[S:%.*]], i32 noundef [[T:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// CHECK-NEXT: [[S_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// CHECK-NEXT: [[T_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: store %struct.S.0* [[S]], %struct.S.0** [[S_ADDR]], align 8 +// CHECK-NEXT: store i32 [[T]], i32* [[T_ADDR]], align 4 +// CHECK-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load %struct.S.0*, %struct.S.0** [[S_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[T_ADDR]], align 4 +// CHECK-NEXT: call void @_ZN1SIiEC2ERKS0_i(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS1]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TMP0]], i32 noundef [[TMP1]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIiEC1Ei +// CHECK-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store %struct.S.0* 
[[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK-NEXT: call void @_ZN1SIiEC2Ei(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS1]], i32 noundef [[TMP0]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@.omp_outlined..2 +// CHECK-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], [2 x i32]* noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 noundef [[T_VAR:%.*]], [2 x %struct.S.0]* noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR3]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[VEC_ADDR:%.*]] = alloca [2 x i32]*, align 8 +// CHECK-NEXT: [[T_VAR_ADDR:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[S_ARR_ADDR:%.*]] = alloca [2 x %struct.S.0]*, align 8 +// CHECK-NEXT: [[VAR_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// CHECK-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON_1:%.*]], align 8 +// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK-NEXT: store [2 x i32]* [[VEC]], [2 x i32]** [[VEC_ADDR]], align 8 +// CHECK-NEXT: store i64 [[T_VAR]], i64* [[T_VAR_ADDR]], align 8 +// CHECK-NEXT: store [2 x %struct.S.0]* [[S_ARR]], [2 x %struct.S.0]** [[S_ARR_ADDR]], align 8 +// CHECK-NEXT: store %struct.S.0* [[VAR]], %struct.S.0** [[VAR_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load [2 x i32]*, [2 x i32]** [[VEC_ADDR]], align 8 +// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[T_VAR_ADDR]] to i32* +// CHECK-NEXT: [[TMP1:%.*]] = load [2 x %struct.S.0]*, [2 x %struct.S.0]** [[S_ARR_ADDR]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load %struct.S.0*, %struct.S.0** [[VAR_ADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = icmp ne i32 [[TMP5]], 0 +// CHECK-NEXT: br i1 [[TMP6]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_END:%.*]] +// CHECK: omp_if.then: +// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_ANON_1]], %struct.anon.1* [[AGG_CAPTURED]], i32 0, i32 0 +// CHECK-NEXT: store [2 x %struct.S.0]* [[TMP1]], [2 x %struct.S.0]** [[TMP7]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_ANON_1]], %struct.anon.1* [[AGG_CAPTURED]], i32 0, i32 1 +// CHECK-NEXT: store %struct.S.0* [[TMP2]], %struct.S.0** [[TMP8]], align 8 +// CHECK-NEXT: call void @__kmpc_taskgroup(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK-NEXT: [[TMP9:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]], i32 9, i64 256, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.2*)* @.omp_task_entry..5 to i32 (i32, i8*)*)) +// CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to %struct.kmp_task_t_with_privates.2* +// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_2:%.*]], 
%struct.kmp_task_t_with_privates.2* [[TMP10]], i32 0, i32 0 +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP11]], i32 0, i32 0 +// CHECK-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 128 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast %struct.anon.1* [[AGG_CAPTURED]] to i8* +// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP13]], i8* align 8 [[TMP14]], i64 16, i1 false) +// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_2]], %struct.kmp_task_t_with_privates.2* [[TMP10]], i32 0, i32 2 +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3:%.*]], %struct..kmp_privates.t.3* [[TMP15]], i32 0, i32 0 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[CONV]], align 4 +// CHECK-NEXT: store i32 [[TMP17]], i32* [[TMP16]], align 128 +// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3]], %struct..kmp_privates.t.3* [[TMP15]], i32 0, i32 1 +// CHECK-NEXT: [[TMP19:%.*]] = bitcast [2 x i32]* [[TMP18]] to i8* +// CHECK-NEXT: [[TMP20:%.*]] = bitcast [2 x i32]* [[TMP0]] to i8* +// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP19]], i8* align 4 [[TMP20]], i64 8, i1 false) +// CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3]], %struct..kmp_privates.t.3* [[TMP15]], i32 0, i32 2 +// CHECK-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[TMP21]], i32 0, i32 0 +// CHECK-NEXT: [[TMP22:%.*]] = bitcast [2 x %struct.S.0]* [[TMP1]] to %struct.S.0* +// CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], %struct.S.0* [[ARRAY_BEGIN]], i64 2 +// CHECK-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq %struct.S.0* [[ARRAY_BEGIN]], [[TMP23]] +// CHECK-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE1:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]] +// CHECK: omp.arraycpy.body: +// CHECK-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi %struct.S.0* [ [[TMP22]], [[OMP_IF_THEN]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] +// CHECK-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi %struct.S.0* [ [[ARRAY_BEGIN]], [[OMP_IF_THEN]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] +// CHECK-NEXT: call void @_ZN1SIiEC1ERKS0_i(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[OMP_ARRAYCPY_DESTELEMENTPAST]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 noundef 0) +// CHECK-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr [[STRUCT_S_0]], %struct.S.0* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1 +// CHECK-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr [[STRUCT_S_0]], %struct.S.0* [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1 +// CHECK-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq %struct.S.0* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP23]] +// CHECK-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE1]], label [[OMP_ARRAYCPY_BODY]] +// CHECK: omp.arraycpy.done1: +// CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3]], %struct..kmp_privates.t.3* [[TMP15]], i32 0, i32 3 +// CHECK-NEXT: call void @_ZN1SIiEC1ERKS0_i(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TMP24]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TMP2]], i32 noundef 0) +// CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP11]], i32 0, i32 3 +// CHECK-NEXT: [[TMP26:%.*]] = bitcast %union.kmp_cmplrdata_t* [[TMP25]] to 
i32 (i32, i8*)** +// CHECK-NEXT: store i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.2*)* @.omp_task_destructor..7 to i32 (i32, i8*)*), i32 (i32, i8*)** [[TMP26]], align 8 +// CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP11]], i32 0, i32 5 +// CHECK-NEXT: store i64 0, i64* [[TMP27]], align 8 +// CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP11]], i32 0, i32 6 +// CHECK-NEXT: store i64 9, i64* [[TMP28]], align 16 +// CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP11]], i32 0, i32 7 +// CHECK-NEXT: store i64 1, i64* [[TMP29]], align 8 +// CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP11]], i32 0, i32 9 +// CHECK-NEXT: [[TMP31:%.*]] = bitcast i8** [[TMP30]] to i8* +// CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP31]], i8 0, i64 8, i1 false) +// CHECK-NEXT: [[TMP32:%.*]] = load i64, i64* [[TMP29]], align 8 +// CHECK-NEXT: call void @__kmpc_taskloop(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]], i8* [[TMP9]], i32 1, i64* [[TMP27]], i64* [[TMP28]], i64 [[TMP32]], i32 1, i32 0, i64 0, i8* bitcast (void (%struct.kmp_task_t_with_privates.2*, %struct.kmp_task_t_with_privates.2*, i32)* @.omp_task_dup..6 to i8*)) +// CHECK-NEXT: call void @__kmpc_end_taskgroup(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK-NEXT: br label [[OMP_IF_END]] +// CHECK: omp_if.end: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@.omp_task_privates_map..4 +// CHECK-SAME: (%struct..kmp_privates.t.3* noalias noundef [[TMP0:%.*]], i32** noalias noundef [[TMP1:%.*]], [2 x i32]** noalias noundef [[TMP2:%.*]], [2 x %struct.S.0]** noalias noundef [[TMP3:%.*]], %struct.S.0** noalias noundef [[TMP4:%.*]]) #[[ATTR6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca %struct..kmp_privates.t.3*, align 8 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32**, align 8 +// CHECK-NEXT: [[DOTADDR2:%.*]] = alloca [2 x i32]**, align 8 +// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca [2 x %struct.S.0]**, align 8 +// CHECK-NEXT: [[DOTADDR4:%.*]] = alloca %struct.S.0**, align 8 +// CHECK-NEXT: store %struct..kmp_privates.t.3* [[TMP0]], %struct..kmp_privates.t.3** [[DOTADDR]], align 8 +// CHECK-NEXT: store i32** [[TMP1]], i32*** [[DOTADDR1]], align 8 +// CHECK-NEXT: store [2 x i32]** [[TMP2]], [2 x i32]*** [[DOTADDR2]], align 8 +// CHECK-NEXT: store [2 x %struct.S.0]** [[TMP3]], [2 x %struct.S.0]*** [[DOTADDR3]], align 8 +// CHECK-NEXT: store %struct.S.0** [[TMP4]], %struct.S.0*** [[DOTADDR4]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load %struct..kmp_privates.t.3*, %struct..kmp_privates.t.3** [[DOTADDR]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3:%.*]], %struct..kmp_privates.t.3* [[TMP5]], i32 0, i32 0 +// CHECK-NEXT: [[TMP7:%.*]] = load i32**, i32*** [[DOTADDR1]], align 8 +// CHECK-NEXT: store i32* [[TMP6]], i32** [[TMP7]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3]], %struct..kmp_privates.t.3* [[TMP5]], i32 0, i32 1 +// CHECK-NEXT: [[TMP9:%.*]] = load [2 x i32]**, [2 x i32]*** [[DOTADDR2]], align 8 +// CHECK-NEXT: store [2 x i32]* [[TMP8]], [2 x i32]** [[TMP9]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3]], %struct..kmp_privates.t.3* [[TMP5]], i32 0, i32 
2 +// CHECK-NEXT: [[TMP11:%.*]] = load [2 x %struct.S.0]**, [2 x %struct.S.0]*** [[DOTADDR3]], align 8 +// CHECK-NEXT: store [2 x %struct.S.0]* [[TMP10]], [2 x %struct.S.0]** [[TMP11]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3]], %struct..kmp_privates.t.3* [[TMP5]], i32 0, i32 3 +// CHECK-NEXT: [[TMP13:%.*]] = load %struct.S.0**, %struct.S.0*** [[DOTADDR4]], align 8 +// CHECK-NEXT: store %struct.S.0* [[TMP12]], %struct.S.0** [[TMP13]], align 8 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@.omp_task_entry..5 +// CHECK-SAME: (i32 noundef [[TMP0:%.*]], %struct.kmp_task_t_with_privates.2* noalias noundef [[TMP1:%.*]]) #[[ATTR7]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca i8*, align 8 +// CHECK-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca void (i8*, ...)*, align 8 +// CHECK-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca i8*, align 8 +// CHECK-NEXT: [[DOTLB__ADDR_I:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[DOTUB__ADDR_I:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[DOTST__ADDR_I:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[DOTLITER__ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTREDUCTIONS__ADDR_I:%.*]] = alloca i8*, align 8 +// CHECK-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca %struct.anon.1*, align 8 +// CHECK-NEXT: [[DOTFIRSTPRIV_PTR_ADDR_I:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[DOTFIRSTPRIV_PTR_ADDR1_I:%.*]] = alloca [2 x i32]*, align 8 +// CHECK-NEXT: [[DOTFIRSTPRIV_PTR_ADDR2_I:%.*]] = alloca [2 x %struct.S.0]*, align 8 +// CHECK-NEXT: [[DOTFIRSTPRIV_PTR_ADDR3_I:%.*]] = alloca %struct.S.0*, align 8 +// CHECK-NEXT: [[I_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_IV_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca %struct.kmp_task_t_with_privates.2*, align 8 +// CHECK-NEXT: store i32 [[TMP0]], i32* [[DOTADDR]], align 4 +// CHECK-NEXT: store %struct.kmp_task_t_with_privates.2* [[TMP1]], %struct.kmp_task_t_with_privates.2** [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTADDR]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load %struct.kmp_task_t_with_privates.2*, %struct.kmp_task_t_with_privates.2** [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_2:%.*]], %struct.kmp_task_t_with_privates.2* [[TMP3]], i32 0, i32 0 +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 2 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 0 +// CHECK-NEXT: [[TMP7:%.*]] = load i8*, i8** [[TMP6]], align 128 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct.anon.1* +// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_2]], %struct.kmp_task_t_with_privates.2* [[TMP3]], i32 0, i32 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast %struct..kmp_privates.t.3* [[TMP9]] to i8* +// CHECK-NEXT: [[TMP11:%.*]] = bitcast %struct.kmp_task_t_with_privates.2* [[TMP3]] to i8* +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 5 +// CHECK-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], 
%struct.kmp_task_t* [[TMP4]], i32 0, i32 6 +// CHECK-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 16 +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 7 +// CHECK-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8 +// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 8 +// CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 64 +// CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 9 +// CHECK-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 8 +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META21:![0-9]+]]) +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META24:![0-9]+]]) +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META26:![0-9]+]]) +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META28:![0-9]+]]) +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META30:![0-9]+]]) +// CHECK-NEXT: store i32 [[TMP2]], i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !32 +// CHECK-NEXT: store i32* [[TMP5]], i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !32 +// CHECK-NEXT: store i8* [[TMP10]], i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !32 +// CHECK-NEXT: store void (i8*, ...)* bitcast (void (%struct..kmp_privates.t.3*, i32**, [2 x i32]**, [2 x %struct.S.0]**, %struct.S.0**)* @.omp_task_privates_map..4 to void (i8*, ...)*), void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !32 +// CHECK-NEXT: store i8* [[TMP11]], i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !32 +// CHECK-NEXT: store i64 [[TMP13]], i64* [[DOTLB__ADDR_I]], align 8, !noalias !32 +// CHECK-NEXT: store i64 [[TMP15]], i64* [[DOTUB__ADDR_I]], align 8, !noalias !32 +// CHECK-NEXT: store i64 [[TMP17]], i64* [[DOTST__ADDR_I]], align 8, !noalias !32 +// CHECK-NEXT: store i32 [[TMP19]], i32* [[DOTLITER__ADDR_I]], align 4, !noalias !32 +// CHECK-NEXT: store i8* [[TMP21]], i8** [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias !32 +// CHECK-NEXT: store %struct.anon.1* [[TMP8]], %struct.anon.1** [[__CONTEXT_ADDR_I]], align 8, !noalias !32 +// CHECK-NEXT: [[TMP22:%.*]] = load %struct.anon.1*, %struct.anon.1** [[__CONTEXT_ADDR_I]], align 8, !noalias !32 +// CHECK-NEXT: [[TMP23:%.*]] = load void (i8*, ...)*, void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !32 +// CHECK-NEXT: [[TMP24:%.*]] = load i8*, i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !32 +// CHECK-NEXT: [[TMP25:%.*]] = bitcast void (i8*, ...)* [[TMP23]] to void (i8*, i32**, [2 x i32]**, [2 x %struct.S.0]**, %struct.S.0**)* +// CHECK-NEXT: call void [[TMP25]](i8* [[TMP24]], i32** [[DOTFIRSTPRIV_PTR_ADDR_I]], [2 x i32]** [[DOTFIRSTPRIV_PTR_ADDR1_I]], [2 x %struct.S.0]** [[DOTFIRSTPRIV_PTR_ADDR2_I]], %struct.S.0** [[DOTFIRSTPRIV_PTR_ADDR3_I]]) #[[ATTR4]] +// CHECK-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !32 +// CHECK-NEXT: [[TMP27:%.*]] = load [2 x i32]*, [2 x i32]** [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias !32 +// CHECK-NEXT: [[TMP28:%.*]] = load [2 x %struct.S.0]*, [2 x %struct.S.0]** [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 8, !noalias !32 +// CHECK-NEXT: [[TMP29:%.*]] = load %struct.S.0*, %struct.S.0** [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 8, !noalias !32 +// CHECK-NEXT: [[TMP30:%.*]] = load i64, i64* [[DOTLB__ADDR_I]], align 8, !noalias !32 +// CHECK-NEXT: 
[[CONV_I:%.*]] = trunc i64 [[TMP30]] to i32 +// CHECK-NEXT: store i32 [[CONV_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !32 +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND_I:%.*]] +// CHECK: omp.inner.for.cond.i: +// CHECK-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !32, !llvm.access.group [[ACC_GRP33:![0-9]+]] +// CHECK-NEXT: [[CONV4_I:%.*]] = sext i32 [[TMP31]] to i64 +// CHECK-NEXT: [[TMP32:%.*]] = load i64, i64* [[DOTUB__ADDR_I]], align 8, !noalias !32, !llvm.access.group [[ACC_GRP33]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule i64 [[CONV4_I]], [[TMP32]] +// CHECK-NEXT: br i1 [[CMP_I]], label [[OMP_INNER_FOR_BODY_I:%.*]], label [[DOTOMP_OUTLINED__3_EXIT:%.*]] +// CHECK: omp.inner.for.body.i: +// CHECK-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !32, !llvm.access.group [[ACC_GRP33]] +// CHECK-NEXT: store i32 [[TMP33]], i32* [[I_I]], align 4, !noalias !32, !llvm.access.group [[ACC_GRP33]] +// CHECK-NEXT: [[TMP34:%.*]] = load i32, i32* [[TMP26]], align 128, !llvm.access.group [[ACC_GRP33]] +// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[TMP27]], i64 0, i64 0 +// CHECK-NEXT: store i32 [[TMP34]], i32* [[ARRAYIDX_I]], align 4, !llvm.access.group [[ACC_GRP33]] +// CHECK-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[TMP28]], i64 0, i64 0 +// CHECK-NEXT: [[TMP35:%.*]] = bitcast %struct.S.0* [[ARRAYIDX5_I]] to i8* +// CHECK-NEXT: [[TMP36:%.*]] = bitcast %struct.S.0* [[TMP29]] to i8* +// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP35]], i8* align 4 [[TMP36]], i64 4, i1 false), !llvm.access.group [[ACC_GRP33]] +// CHECK-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !32, !llvm.access.group [[ACC_GRP33]] +// CHECK-NEXT: [[ADD6_I:%.*]] = add nsw i32 [[TMP37]], 1 +// CHECK-NEXT: store i32 [[ADD6_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !32, !llvm.access.group [[ACC_GRP33]] +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND_I]], !llvm.loop [[LOOP34:![0-9]+]] +// CHECK: .omp_outlined..3.exit: +// CHECK-NEXT: ret i32 0 +// +// +// CHECK-LABEL: define {{[^@]+}}@.omp_task_dup..6 +// CHECK-SAME: (%struct.kmp_task_t_with_privates.2* noundef [[TMP0:%.*]], %struct.kmp_task_t_with_privates.2* noundef [[TMP1:%.*]], i32 noundef [[TMP2:%.*]]) #[[ATTR7]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca %struct.kmp_task_t_with_privates.2*, align 8 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca %struct.kmp_task_t_with_privates.2*, align 8 +// CHECK-NEXT: [[DOTADDR2:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store %struct.kmp_task_t_with_privates.2* [[TMP0]], %struct.kmp_task_t_with_privates.2** [[DOTADDR]], align 8 +// CHECK-NEXT: store %struct.kmp_task_t_with_privates.2* [[TMP1]], %struct.kmp_task_t_with_privates.2** [[DOTADDR1]], align 8 +// CHECK-NEXT: store i32 [[TMP2]], i32* [[DOTADDR2]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load %struct.kmp_task_t_with_privates.2*, %struct.kmp_task_t_with_privates.2** [[DOTADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load %struct.kmp_task_t_with_privates.2*, %struct.kmp_task_t_with_privates.2** [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_2:%.*]], %struct.kmp_task_t_with_privates.2* [[TMP4]], i32 0, i32 0 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP5]], i32 0, i32 0 +// CHECK-NEXT: [[TMP7:%.*]] = load i8*, i8** [[TMP6]], align 128 +// CHECK-NEXT: 
[[TMP8:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_2]], %struct.kmp_task_t_with_privates.2* [[TMP3]], i32 0, i32 2 +// CHECK-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP7]] to %struct.anon.1* +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3:%.*]], %struct..kmp_privates.t.3* [[TMP8]], i32 0, i32 2 +// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_ANON_1:%.*]], %struct.anon.1* [[TMP9]], i32 0, i32 0 +// CHECK-NEXT: [[TMP12:%.*]] = load [2 x %struct.S.0]*, [2 x %struct.S.0]** [[TMP11]], align 8 +// CHECK-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[TMP10]], i32 0, i32 0 +// CHECK-NEXT: [[TMP13:%.*]] = bitcast [2 x %struct.S.0]* [[TMP12]] to %struct.S.0* +// CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], %struct.S.0* [[ARRAY_BEGIN]], i64 2 +// CHECK-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq %struct.S.0* [[ARRAY_BEGIN]], [[TMP14]] +// CHECK-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE3:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]] +// CHECK: omp.arraycpy.body: +// CHECK-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi %struct.S.0* [ [[TMP13]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] +// CHECK-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi %struct.S.0* [ [[ARRAY_BEGIN]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] +// CHECK-NEXT: call void @_ZN1SIiEC1ERKS0_i(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[OMP_ARRAYCPY_DESTELEMENTPAST]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 noundef 0) +// CHECK-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr [[STRUCT_S_0]], %struct.S.0* [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1 +// CHECK-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr [[STRUCT_S_0]], %struct.S.0* [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1 +// CHECK-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq %struct.S.0* [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP14]] +// CHECK-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE3]], label [[OMP_ARRAYCPY_BODY]] +// CHECK: omp.arraycpy.done3: +// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3]], %struct..kmp_privates.t.3* [[TMP8]], i32 0, i32 3 +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_ANON_1]], %struct.anon.1* [[TMP9]], i32 0, i32 1 +// CHECK-NEXT: [[TMP17:%.*]] = load %struct.S.0*, %struct.S.0** [[TMP16]], align 8 +// CHECK-NEXT: call void @_ZN1SIiEC1ERKS0_i(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TMP15]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TMP17]], i32 noundef 0) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@.omp_task_destructor..7 +// CHECK-SAME: (i32 noundef [[TMP0:%.*]], %struct.kmp_task_t_with_privates.2* noalias noundef [[TMP1:%.*]]) #[[ATTR7]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca %struct.kmp_task_t_with_privates.2*, align 8 +// CHECK-NEXT: store i32 [[TMP0]], i32* [[DOTADDR]], align 4 +// CHECK-NEXT: store %struct.kmp_task_t_with_privates.2* [[TMP1]], %struct.kmp_task_t_with_privates.2** [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load %struct.kmp_task_t_with_privates.2*, %struct.kmp_task_t_with_privates.2** [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds 
[[STRUCT_KMP_TASK_T_WITH_PRIVATES_2:%.*]], %struct.kmp_task_t_with_privates.2* [[TMP2]], i32 0, i32 2 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3:%.*]], %struct..kmp_privates.t.3* [[TMP3]], i32 0, i32 2 +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3]], %struct..kmp_privates.t.3* [[TMP3]], i32 0, i32 3 +// CHECK-NEXT: call void @_ZN1SIiED1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TMP5]]) #[[ATTR4]] +// CHECK-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[TMP4]], i32 0, i32 0 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], %struct.S.0* [[ARRAY_BEGIN]], i64 2 +// CHECK-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] +// CHECK: arraydestroy.body: +// CHECK-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi %struct.S.0* [ [[TMP6]], [[ENTRY:%.*]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] +// CHECK-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], %struct.S.0* [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 +// CHECK-NEXT: call void @_ZN1SIiED1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq %struct.S.0* [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] +// CHECK-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE2:%.*]], label [[ARRAYDESTROY_BODY]] +// CHECK: arraydestroy.done2: +// CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[RETVAL]], align 4 +// CHECK-NEXT: ret i32 [[TMP7]] +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIiED1Ev +// CHECK-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// CHECK-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: call void @_ZN1SIiED2Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIiEC2Ev +// CHECK-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// CHECK-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], %struct.S.0* [[THIS1]], i32 0, i32 0 +// CHECK-NEXT: store i32 0, i32* [[F]], align 4 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIiEC2ERKS0_i +// CHECK-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[S:%.*]], i32 noundef [[T:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// CHECK-NEXT: [[S_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// CHECK-NEXT: [[T_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: store %struct.S.0* [[S]], %struct.S.0** [[S_ADDR]], align 8 +// CHECK-NEXT: store i32 [[T]], i32* [[T_ADDR]], align 4 +// CHECK-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: 
[[F:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], %struct.S.0* [[THIS1]], i32 0, i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = load %struct.S.0*, %struct.S.0** [[S_ADDR]], align 8 +// CHECK-NEXT: [[F2:%.*]] = getelementptr inbounds [[STRUCT_S_0]], %struct.S.0* [[TMP0]], i32 0, i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[F2]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[T_ADDR]], align 4 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP2]] +// CHECK-NEXT: store i32 [[ADD]], i32* [[F]], align 4 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIiEC2Ei +// CHECK-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], %struct.S.0* [[THIS1]], i32 0, i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], i32* [[F]], align 4 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN1SIiED2Ev +// CHECK-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// CHECK-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// CHECK-NEXT: ret void +// +// +// LAMBDA-LABEL: define {{[^@]+}}@main +// LAMBDA-SAME: () #[[ATTR0:[0-9]+]] { +// LAMBDA-NEXT: entry: +// LAMBDA-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// LAMBDA-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_ANON:%.*]], align 1 +// LAMBDA-NEXT: store i32 0, i32* [[RETVAL]], align 4 +// LAMBDA-NEXT: call void @"_ZZ4mainENK3$_0clEv"(%class.anon* noundef nonnull align 1 dereferenceable(1) [[REF_TMP]]) +// LAMBDA-NEXT: ret i32 0 +// +// +// LAMBDA-LABEL: define {{[^@]+}}@.omp_outlined. 
+// LAMBDA-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2:[0-9]+]] { +// LAMBDA-NEXT: entry: +// LAMBDA-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// LAMBDA-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// LAMBDA-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 1 +// LAMBDA-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// LAMBDA-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// LAMBDA-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// LAMBDA-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// LAMBDA-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// LAMBDA-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP1]]) +// LAMBDA-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 +// LAMBDA-NEXT: br i1 [[TMP3]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_END:%.*]] +// LAMBDA: omp_if.then: +// LAMBDA-NEXT: call void @__kmpc_taskgroup(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]]) +// LAMBDA-NEXT: [[TMP4:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i64 96, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @.omp_task_entry. to i32 (i32, i8*)*)) +// LAMBDA-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct.kmp_task_t_with_privates* +// LAMBDA-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], %struct.kmp_task_t_with_privates* [[TMP5]], i32 0, i32 0 +// LAMBDA-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], %struct.kmp_task_t_with_privates* [[TMP5]], i32 0, i32 1 +// LAMBDA-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP7]], i32 0, i32 0 +// LAMBDA-NEXT: [[TMP9:%.*]] = load volatile double, double* @g, align 8 +// LAMBDA-NEXT: store volatile double [[TMP9]], double* [[TMP8]], align 8 +// LAMBDA-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP7]], i32 0, i32 1 +// LAMBDA-NEXT: [[TMP11:%.*]] = load i32, i32* @_ZZ4mainE5sivar, align 4 +// LAMBDA-NEXT: store i32 [[TMP11]], i32* [[TMP10]], align 8 +// LAMBDA-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP6]], i32 0, i32 5 +// LAMBDA-NEXT: store i64 0, i64* [[TMP12]], align 8 +// LAMBDA-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP6]], i32 0, i32 6 +// LAMBDA-NEXT: store i64 9, i64* [[TMP13]], align 8 +// LAMBDA-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP6]], i32 0, i32 7 +// LAMBDA-NEXT: store i64 1, i64* [[TMP14]], align 8 +// LAMBDA-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP6]], i32 0, i32 9 +// LAMBDA-NEXT: [[TMP16:%.*]] = bitcast i8** [[TMP15]] to i8* +// LAMBDA-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP16]], i8 0, i64 8, i1 false) +// LAMBDA-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP14]], align 8 +// LAMBDA-NEXT: call void @__kmpc_taskloop(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i8* [[TMP4]], i32 1, i64* [[TMP12]], i64* [[TMP13]], i64 [[TMP17]], i32 1, i32 0, i64 0, i8* null) +// LAMBDA-NEXT: call void @__kmpc_end_taskgroup(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]]) +// LAMBDA-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]]) +// LAMBDA-NEXT: br label 
[[OMP_IF_END]] +// LAMBDA: omp_if.end: +// LAMBDA-NEXT: ret void +// +// +// LAMBDA-LABEL: define {{[^@]+}}@.omp_task_privates_map. +// LAMBDA-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], double** noalias noundef [[TMP1:%.*]], i32** noalias noundef [[TMP2:%.*]]) #[[ATTR5:[0-9]+]] { +// LAMBDA-NEXT: entry: +// LAMBDA-NEXT: [[DOTADDR:%.*]] = alloca %struct..kmp_privates.t*, align 8 +// LAMBDA-NEXT: [[DOTADDR1:%.*]] = alloca double**, align 8 +// LAMBDA-NEXT: [[DOTADDR2:%.*]] = alloca i32**, align 8 +// LAMBDA-NEXT: store %struct..kmp_privates.t* [[TMP0]], %struct..kmp_privates.t** [[DOTADDR]], align 8 +// LAMBDA-NEXT: store double** [[TMP1]], double*** [[DOTADDR1]], align 8 +// LAMBDA-NEXT: store i32** [[TMP2]], i32*** [[DOTADDR2]], align 8 +// LAMBDA-NEXT: [[TMP3:%.*]] = load %struct..kmp_privates.t*, %struct..kmp_privates.t** [[DOTADDR]], align 8 +// LAMBDA-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP3]], i32 0, i32 0 +// LAMBDA-NEXT: [[TMP5:%.*]] = load double**, double*** [[DOTADDR1]], align 8 +// LAMBDA-NEXT: store double* [[TMP4]], double** [[TMP5]], align 8 +// LAMBDA-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP3]], i32 0, i32 1 +// LAMBDA-NEXT: [[TMP7:%.*]] = load i32**, i32*** [[DOTADDR2]], align 8 +// LAMBDA-NEXT: store i32* [[TMP6]], i32** [[TMP7]], align 8 +// LAMBDA-NEXT: ret void +// +// +// LAMBDA-LABEL: define {{[^@]+}}@.omp_task_entry. +// LAMBDA-SAME: (i32 noundef [[TMP0:%.*]], %struct.kmp_task_t_with_privates* noalias noundef [[TMP1:%.*]]) #[[ATTR6:[0-9]+]] { +// LAMBDA-NEXT: entry: +// LAMBDA-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4 +// LAMBDA-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca i32*, align 8 +// LAMBDA-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca i8*, align 8 +// LAMBDA-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca void (i8*, ...)*, align 8 +// LAMBDA-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca i8*, align 8 +// LAMBDA-NEXT: [[DOTLB__ADDR_I:%.*]] = alloca i64, align 8 +// LAMBDA-NEXT: [[DOTUB__ADDR_I:%.*]] = alloca i64, align 8 +// LAMBDA-NEXT: [[DOTST__ADDR_I:%.*]] = alloca i64, align 8 +// LAMBDA-NEXT: [[DOTLITER__ADDR_I:%.*]] = alloca i32, align 4 +// LAMBDA-NEXT: [[DOTREDUCTIONS__ADDR_I:%.*]] = alloca i8*, align 8 +// LAMBDA-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca %struct.anon*, align 8 +// LAMBDA-NEXT: [[DOTFIRSTPRIV_PTR_ADDR_I:%.*]] = alloca double*, align 8 +// LAMBDA-NEXT: [[DOTFIRSTPRIV_PTR_ADDR1_I:%.*]] = alloca i32*, align 8 +// LAMBDA-NEXT: [[I_I:%.*]] = alloca i32, align 4 +// LAMBDA-NEXT: [[DOTOMP_IV_I:%.*]] = alloca i32, align 4 +// LAMBDA-NEXT: [[REF_TMP_I:%.*]] = alloca [[CLASS_ANON_0:%.*]], align 8 +// LAMBDA-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 +// LAMBDA-NEXT: [[DOTADDR1:%.*]] = alloca %struct.kmp_task_t_with_privates*, align 8 +// LAMBDA-NEXT: store i32 [[TMP0]], i32* [[DOTADDR]], align 4 +// LAMBDA-NEXT: store %struct.kmp_task_t_with_privates* [[TMP1]], %struct.kmp_task_t_with_privates** [[DOTADDR1]], align 8 +// LAMBDA-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTADDR]], align 4 +// LAMBDA-NEXT: [[TMP3:%.*]] = load %struct.kmp_task_t_with_privates*, %struct.kmp_task_t_with_privates** [[DOTADDR1]], align 8 +// LAMBDA-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], %struct.kmp_task_t_with_privates* [[TMP3]], i32 0, i32 0 +// LAMBDA-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 2 +// LAMBDA-NEXT: 
[[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 0 +// LAMBDA-NEXT: [[TMP7:%.*]] = load i8*, i8** [[TMP6]], align 8 +// LAMBDA-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct.anon* +// LAMBDA-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], %struct.kmp_task_t_with_privates* [[TMP3]], i32 0, i32 1 +// LAMBDA-NEXT: [[TMP10:%.*]] = bitcast %struct..kmp_privates.t* [[TMP9]] to i8* +// LAMBDA-NEXT: [[TMP11:%.*]] = bitcast %struct.kmp_task_t_with_privates* [[TMP3]] to i8* +// LAMBDA-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 5 +// LAMBDA-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8 +// LAMBDA-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 6 +// LAMBDA-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8 +// LAMBDA-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 7 +// LAMBDA-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8 +// LAMBDA-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 8 +// LAMBDA-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 8 +// LAMBDA-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 9 +// LAMBDA-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 8 +// LAMBDA-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]]) +// LAMBDA-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]]) +// LAMBDA-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META8:![0-9]+]]) +// LAMBDA-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]]) +// LAMBDA-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]]) +// LAMBDA-NEXT: store i32 [[TMP2]], i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14 +// LAMBDA-NEXT: store i32* [[TMP5]], i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !14 +// LAMBDA-NEXT: store i8* [[TMP10]], i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !14 +// LAMBDA-NEXT: store void (i8*, ...)* bitcast (void (%struct..kmp_privates.t*, double**, i32**)* @.omp_task_privates_map. 
to void (i8*, ...)*), void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !14 +// LAMBDA-NEXT: store i8* [[TMP11]], i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !14 +// LAMBDA-NEXT: store i64 [[TMP13]], i64* [[DOTLB__ADDR_I]], align 8, !noalias !14 +// LAMBDA-NEXT: store i64 [[TMP15]], i64* [[DOTUB__ADDR_I]], align 8, !noalias !14 +// LAMBDA-NEXT: store i64 [[TMP17]], i64* [[DOTST__ADDR_I]], align 8, !noalias !14 +// LAMBDA-NEXT: store i32 [[TMP19]], i32* [[DOTLITER__ADDR_I]], align 4, !noalias !14 +// LAMBDA-NEXT: store i8* [[TMP21]], i8** [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias !14 +// LAMBDA-NEXT: store %struct.anon* [[TMP8]], %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !14 +// LAMBDA-NEXT: [[TMP22:%.*]] = load %struct.anon*, %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !14 +// LAMBDA-NEXT: [[TMP23:%.*]] = load void (i8*, ...)*, void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !14 +// LAMBDA-NEXT: [[TMP24:%.*]] = load i8*, i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !14 +// LAMBDA-NEXT: [[TMP25:%.*]] = bitcast void (i8*, ...)* [[TMP23]] to void (i8*, double**, i32**)* +// LAMBDA-NEXT: call void [[TMP25]](i8* [[TMP24]], double** [[DOTFIRSTPRIV_PTR_ADDR_I]], i32** [[DOTFIRSTPRIV_PTR_ADDR1_I]]) #[[ATTR3:[0-9]+]] +// LAMBDA-NEXT: [[TMP26:%.*]] = load double*, double** [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !14 +// LAMBDA-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias !14 +// LAMBDA-NEXT: [[TMP28:%.*]] = load i64, i64* [[DOTLB__ADDR_I]], align 8, !noalias !14 +// LAMBDA-NEXT: [[CONV_I:%.*]] = trunc i64 [[TMP28]] to i32 +// LAMBDA-NEXT: store i32 [[CONV_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !14 +// LAMBDA-NEXT: br label [[OMP_INNER_FOR_COND_I:%.*]] +// LAMBDA: omp.inner.for.cond.i: +// LAMBDA-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15:![0-9]+]] +// LAMBDA-NEXT: [[CONV2_I:%.*]] = sext i32 [[TMP29]] to i64 +// LAMBDA-NEXT: [[TMP30:%.*]] = load i64, i64* [[DOTUB__ADDR_I]], align 8, !noalias !14, !llvm.access.group [[ACC_GRP15]] +// LAMBDA-NEXT: [[CMP_I:%.*]] = icmp ule i64 [[CONV2_I]], [[TMP30]] +// LAMBDA-NEXT: br i1 [[CMP_I]], label [[OMP_INNER_FOR_BODY_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]] +// LAMBDA: omp.inner.for.body.i: +// LAMBDA-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]] +// LAMBDA-NEXT: store i32 [[TMP31]], i32* [[I_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]] +// LAMBDA-NEXT: store double 1.000000e+00, double* [[TMP26]], align 8, !llvm.access.group [[ACC_GRP15]] +// LAMBDA-NEXT: store i32 11, i32* [[TMP27]], align 4, !llvm.access.group [[ACC_GRP15]] +// LAMBDA-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], %class.anon.0* [[REF_TMP_I]], i32 0, i32 0 +// LAMBDA-NEXT: store double* [[TMP26]], double** [[TMP32]], align 8, !noalias !14, !llvm.access.group [[ACC_GRP15]] +// LAMBDA-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], %class.anon.0* [[REF_TMP_I]], i32 0, i32 1 +// LAMBDA-NEXT: store i32* [[TMP27]], i32** [[TMP33]], align 8, !noalias !14, !llvm.access.group [[ACC_GRP15]] +// LAMBDA-NEXT: call void @"_ZZZ4mainENK3$_0clEvENKUlvE_clEv"(%class.anon.0* noundef nonnull align 8 dereferenceable(16) [[REF_TMP_I]]), !llvm.access.group [[ACC_GRP15]] +// LAMBDA-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]] +// LAMBDA-NEXT: [[ADD3_I:%.*]] = add 
nsw i32 [[TMP34]], 1 +// LAMBDA-NEXT: store i32 [[ADD3_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]] +// LAMBDA-NEXT: br label [[OMP_INNER_FOR_COND_I]], !llvm.loop [[LOOP16:![0-9]+]] +// LAMBDA: .omp_outlined..1.exit: +// LAMBDA-NEXT: ret i32 0 +// +// +// BLOCKS-LABEL: define {{[^@]+}}@main +// BLOCKS-SAME: () #[[ATTR1:[0-9]+]] { +// BLOCKS-NEXT: entry: +// BLOCKS-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// BLOCKS-NEXT: store i32 0, i32* [[RETVAL]], align 4 +// BLOCKS-NEXT: [[TMP0:%.*]] = load i8*, i8** getelementptr inbounds ([[STRUCT___BLOCK_LITERAL_GENERIC:%.*]], %struct.__block_literal_generic* bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor* }* @__block_literal_global to %struct.__block_literal_generic*), i32 0, i32 3), align 8 +// BLOCKS-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to void (i8*)* +// BLOCKS-NEXT: call void [[TMP1]](i8* noundef bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor* }* @__block_literal_global to i8*)) +// BLOCKS-NEXT: ret i32 0 +// +// +// BLOCKS-LABEL: define {{[^@]+}}@__main_block_invoke +// BLOCKS-SAME: (i8* noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR2:[0-9]+]] { +// BLOCKS-NEXT: entry: +// BLOCKS-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca i8*, align 8 +// BLOCKS-NEXT: [[BLOCK_ADDR:%.*]] = alloca <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>*, align 8 +// BLOCKS-NEXT: store i8* [[DOTBLOCK_DESCRIPTOR]], i8** [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8 +// BLOCKS-NEXT: [[BLOCK:%.*]] = bitcast i8* [[DOTBLOCK_DESCRIPTOR]] to <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>* +// BLOCKS-NEXT: store <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>* [[BLOCK]], <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>** [[BLOCK_ADDR]], align 8 +// BLOCKS-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*)) +// BLOCKS-NEXT: ret void +// +// +// BLOCKS-LABEL: define {{[^@]+}}@.omp_outlined. +// BLOCKS-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3:[0-9]+]] { +// BLOCKS-NEXT: entry: +// BLOCKS-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// BLOCKS-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// BLOCKS-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 1 +// BLOCKS-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// BLOCKS-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// BLOCKS-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// BLOCKS-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// BLOCKS-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// BLOCKS-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]]) +// BLOCKS-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 +// BLOCKS-NEXT: br i1 [[TMP3]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_END:%.*]] +// BLOCKS: omp_if.then: +// BLOCKS-NEXT: call void @__kmpc_taskgroup(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]]) +// BLOCKS-NEXT: [[TMP4:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i64 96, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @.omp_task_entry. 
to i32 (i32, i8*)*)) +// BLOCKS-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct.kmp_task_t_with_privates* +// BLOCKS-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], %struct.kmp_task_t_with_privates* [[TMP5]], i32 0, i32 0 +// BLOCKS-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], %struct.kmp_task_t_with_privates* [[TMP5]], i32 0, i32 1 +// BLOCKS-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP7]], i32 0, i32 0 +// BLOCKS-NEXT: [[TMP9:%.*]] = load volatile double, double* @g, align 8 +// BLOCKS-NEXT: store volatile double [[TMP9]], double* [[TMP8]], align 8 +// BLOCKS-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP7]], i32 0, i32 1 +// BLOCKS-NEXT: [[TMP11:%.*]] = load i32, i32* @_ZZ4mainE5sivar, align 4 +// BLOCKS-NEXT: store i32 [[TMP11]], i32* [[TMP10]], align 8 +// BLOCKS-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP6]], i32 0, i32 5 +// BLOCKS-NEXT: store i64 0, i64* [[TMP12]], align 8 +// BLOCKS-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP6]], i32 0, i32 6 +// BLOCKS-NEXT: store i64 9, i64* [[TMP13]], align 8 +// BLOCKS-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP6]], i32 0, i32 7 +// BLOCKS-NEXT: store i64 1, i64* [[TMP14]], align 8 +// BLOCKS-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP6]], i32 0, i32 9 +// BLOCKS-NEXT: [[TMP16:%.*]] = bitcast i8** [[TMP15]] to i8* +// BLOCKS-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP16]], i8 0, i64 8, i1 false) +// BLOCKS-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP14]], align 8 +// BLOCKS-NEXT: call void @__kmpc_taskloop(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i8* [[TMP4]], i32 1, i64* [[TMP12]], i64* [[TMP13]], i64 [[TMP17]], i32 1, i32 0, i64 0, i8* null) +// BLOCKS-NEXT: call void @__kmpc_end_taskgroup(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]]) +// BLOCKS-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]]) +// BLOCKS-NEXT: br label [[OMP_IF_END]] +// BLOCKS: omp_if.end: +// BLOCKS-NEXT: ret void +// +// +// BLOCKS-LABEL: define {{[^@]+}}@_block_invoke +// BLOCKS-SAME: (i8* noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR2]] { +// BLOCKS-NEXT: entry: +// BLOCKS-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca i8*, align 8 +// BLOCKS-NEXT: [[BLOCK_ADDR:%.*]] = alloca <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>*, align 8 +// BLOCKS-NEXT: store i8* [[DOTBLOCK_DESCRIPTOR]], i8** [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8 +// BLOCKS-NEXT: [[BLOCK:%.*]] = bitcast i8* [[DOTBLOCK_DESCRIPTOR]] to <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* +// BLOCKS-NEXT: store <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK]], <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>** [[BLOCK_ADDR]], align 8 +// BLOCKS-NEXT: [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK]], i32 0, i32 5 +// BLOCKS-NEXT: store double 2.000000e+00, double* [[BLOCK_CAPTURE_ADDR]], align 8 +// BLOCKS-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, 
%struct.__block_descriptor*, double, i32 }>* [[BLOCK]], i32 0, i32 6 +// BLOCKS-NEXT: store i32 22, i32* [[BLOCK_CAPTURE_ADDR1]], align 8 +// BLOCKS-NEXT: ret void +// +// +// BLOCKS-LABEL: define {{[^@]+}}@.omp_task_privates_map. +// BLOCKS-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], double** noalias noundef [[TMP1:%.*]], i32** noalias noundef [[TMP2:%.*]]) #[[ATTR6:[0-9]+]] { +// BLOCKS-NEXT: entry: +// BLOCKS-NEXT: [[DOTADDR:%.*]] = alloca %struct..kmp_privates.t*, align 8 +// BLOCKS-NEXT: [[DOTADDR1:%.*]] = alloca double**, align 8 +// BLOCKS-NEXT: [[DOTADDR2:%.*]] = alloca i32**, align 8 +// BLOCKS-NEXT: store %struct..kmp_privates.t* [[TMP0]], %struct..kmp_privates.t** [[DOTADDR]], align 8 +// BLOCKS-NEXT: store double** [[TMP1]], double*** [[DOTADDR1]], align 8 +// BLOCKS-NEXT: store i32** [[TMP2]], i32*** [[DOTADDR2]], align 8 +// BLOCKS-NEXT: [[TMP3:%.*]] = load %struct..kmp_privates.t*, %struct..kmp_privates.t** [[DOTADDR]], align 8 +// BLOCKS-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP3]], i32 0, i32 0 +// BLOCKS-NEXT: [[TMP5:%.*]] = load double**, double*** [[DOTADDR1]], align 8 +// BLOCKS-NEXT: store double* [[TMP4]], double** [[TMP5]], align 8 +// BLOCKS-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP3]], i32 0, i32 1 +// BLOCKS-NEXT: [[TMP7:%.*]] = load i32**, i32*** [[DOTADDR2]], align 8 +// BLOCKS-NEXT: store i32* [[TMP6]], i32** [[TMP7]], align 8 +// BLOCKS-NEXT: ret void +// +// +// BLOCKS-LABEL: define {{[^@]+}}@.omp_task_entry. +// BLOCKS-SAME: (i32 noundef [[TMP0:%.*]], %struct.kmp_task_t_with_privates* noalias noundef [[TMP1:%.*]]) #[[ATTR7:[0-9]+]] { +// BLOCKS-NEXT: entry: +// BLOCKS-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4 +// BLOCKS-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca i32*, align 8 +// BLOCKS-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca i8*, align 8 +// BLOCKS-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca void (i8*, ...)*, align 8 +// BLOCKS-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca i8*, align 8 +// BLOCKS-NEXT: [[DOTLB__ADDR_I:%.*]] = alloca i64, align 8 +// BLOCKS-NEXT: [[DOTUB__ADDR_I:%.*]] = alloca i64, align 8 +// BLOCKS-NEXT: [[DOTST__ADDR_I:%.*]] = alloca i64, align 8 +// BLOCKS-NEXT: [[DOTLITER__ADDR_I:%.*]] = alloca i32, align 4 +// BLOCKS-NEXT: [[DOTREDUCTIONS__ADDR_I:%.*]] = alloca i8*, align 8 +// BLOCKS-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca %struct.anon*, align 8 +// BLOCKS-NEXT: [[DOTFIRSTPRIV_PTR_ADDR_I:%.*]] = alloca double*, align 8 +// BLOCKS-NEXT: [[DOTFIRSTPRIV_PTR_ADDR1_I:%.*]] = alloca i32*, align 8 +// BLOCKS-NEXT: [[I_I:%.*]] = alloca i32, align 4 +// BLOCKS-NEXT: [[DOTOMP_IV_I:%.*]] = alloca i32, align 4 +// BLOCKS-NEXT: [[BLOCK_I:%.*]] = alloca <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, align 8 +// BLOCKS-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 +// BLOCKS-NEXT: [[DOTADDR1:%.*]] = alloca %struct.kmp_task_t_with_privates*, align 8 +// BLOCKS-NEXT: store i32 [[TMP0]], i32* [[DOTADDR]], align 4 +// BLOCKS-NEXT: store %struct.kmp_task_t_with_privates* [[TMP1]], %struct.kmp_task_t_with_privates** [[DOTADDR1]], align 8 +// BLOCKS-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTADDR]], align 4 +// BLOCKS-NEXT: [[TMP3:%.*]] = load %struct.kmp_task_t_with_privates*, %struct.kmp_task_t_with_privates** [[DOTADDR1]], align 8 +// BLOCKS-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], %struct.kmp_task_t_with_privates* [[TMP3]], i32 0, i32 0 
+// BLOCKS-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 2
+// BLOCKS-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 0
+// BLOCKS-NEXT: [[TMP7:%.*]] = load i8*, i8** [[TMP6]], align 8
+// BLOCKS-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct.anon*
+// BLOCKS-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], %struct.kmp_task_t_with_privates* [[TMP3]], i32 0, i32 1
+// BLOCKS-NEXT: [[TMP10:%.*]] = bitcast %struct..kmp_privates.t* [[TMP9]] to i8*
+// BLOCKS-NEXT: [[TMP11:%.*]] = bitcast %struct.kmp_task_t_with_privates* [[TMP3]] to i8*
+// BLOCKS-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 5
+// BLOCKS-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8
+// BLOCKS-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 6
+// BLOCKS-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8
+// BLOCKS-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 7
+// BLOCKS-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8
+// BLOCKS-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 8
+// BLOCKS-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 8
+// BLOCKS-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 9
+// BLOCKS-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 8
+// BLOCKS-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]])
+// BLOCKS-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]])
+// BLOCKS-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META8:![0-9]+]])
+// BLOCKS-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]])
+// BLOCKS-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]])
+// BLOCKS-NEXT: store i32 [[TMP2]], i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14
+// BLOCKS-NEXT: store i32* [[TMP5]], i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !14
+// BLOCKS-NEXT: store i8* [[TMP10]], i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !14
+// BLOCKS-NEXT: store void (i8*, ...)* bitcast (void (%struct..kmp_privates.t*, double**, i32**)* @.omp_task_privates_map. to void (i8*, ...)*), void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !14
+// BLOCKS-NEXT: store i8* [[TMP11]], i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !14
+// BLOCKS-NEXT: store i64 [[TMP13]], i64* [[DOTLB__ADDR_I]], align 8, !noalias !14
+// BLOCKS-NEXT: store i64 [[TMP15]], i64* [[DOTUB__ADDR_I]], align 8, !noalias !14
+// BLOCKS-NEXT: store i64 [[TMP17]], i64* [[DOTST__ADDR_I]], align 8, !noalias !14
+// BLOCKS-NEXT: store i32 [[TMP19]], i32* [[DOTLITER__ADDR_I]], align 4, !noalias !14
+// BLOCKS-NEXT: store i8* [[TMP21]], i8** [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias !14
+// BLOCKS-NEXT: store %struct.anon* [[TMP8]], %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !14
+// BLOCKS-NEXT: [[TMP22:%.*]] = load %struct.anon*, %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !14
+// BLOCKS-NEXT: [[TMP23:%.*]] = load void (i8*, ...)*, void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !14
+// BLOCKS-NEXT: [[TMP24:%.*]] = load i8*, i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !14
+// BLOCKS-NEXT: [[TMP25:%.*]] = bitcast void (i8*, ...)* [[TMP23]] to void (i8*, double**, i32**)*
+// BLOCKS-NEXT: call void [[TMP25]](i8* [[TMP24]], double** [[DOTFIRSTPRIV_PTR_ADDR_I]], i32** [[DOTFIRSTPRIV_PTR_ADDR1_I]]) #[[ATTR4:[0-9]+]]
+// BLOCKS-NEXT: [[TMP26:%.*]] = load double*, double** [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !14
+// BLOCKS-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias !14
+// BLOCKS-NEXT: [[TMP28:%.*]] = load i64, i64* [[DOTLB__ADDR_I]], align 8, !noalias !14
+// BLOCKS-NEXT: [[CONV_I:%.*]] = trunc i64 [[TMP28]] to i32
+// BLOCKS-NEXT: store i32 [[CONV_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !14
+// BLOCKS-NEXT: br label [[OMP_INNER_FOR_COND_I:%.*]]
+// BLOCKS: omp.inner.for.cond.i:
+// BLOCKS-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15:![0-9]+]]
+// BLOCKS-NEXT: [[CONV2_I:%.*]] = sext i32 [[TMP29]] to i64
+// BLOCKS-NEXT: [[TMP30:%.*]] = load i64, i64* [[DOTUB__ADDR_I]], align 8, !noalias !14, !llvm.access.group [[ACC_GRP15]]
+// BLOCKS-NEXT: [[CMP_I:%.*]] = icmp ule i64 [[CONV2_I]], [[TMP30]]
+// BLOCKS-NEXT: br i1 [[CMP_I]], label [[OMP_INNER_FOR_BODY_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]]
+// BLOCKS: omp.inner.for.body.i:
+// BLOCKS-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]]
+// BLOCKS-NEXT: store i32 [[TMP31]], i32* [[I_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]]
+// BLOCKS-NEXT: store double 1.000000e+00, double* [[TMP26]], align 8, !llvm.access.group [[ACC_GRP15]]
+// BLOCKS-NEXT: store i32 11, i32* [[TMP27]], align 4, !llvm.access.group [[ACC_GRP15]]
+// BLOCKS-NEXT: [[BLOCK_ISA_I:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK_I]], i32 0, i32 0
+// BLOCKS-NEXT: store i8* bitcast (i8** @_NSConcreteStackBlock to i8*), i8** [[BLOCK_ISA_I]], align 8, !noalias !14, !llvm.access.group [[ACC_GRP15]]
+// BLOCKS-NEXT: [[BLOCK_FLAGS_I:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK_I]], i32 0, i32 1
+// BLOCKS-NEXT: store i32 1073741824, i32* [[BLOCK_FLAGS_I]], align 8, !noalias !14, !llvm.access.group [[ACC_GRP15]]
+// BLOCKS-NEXT: [[BLOCK_RESERVED_I:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK_I]], i32 0, i32 2
+// BLOCKS-NEXT: store i32 0, i32* [[BLOCK_RESERVED_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]]
+// BLOCKS-NEXT: [[BLOCK_INVOKE_I:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK_I]], i32 0, i32 3
+// BLOCKS-NEXT: store i8* bitcast (void (i8*)* @_block_invoke to i8*), i8** [[BLOCK_INVOKE_I]], align 8, !noalias !14, !llvm.access.group [[ACC_GRP15]]
+// BLOCKS-NEXT: [[BLOCK_DESCRIPTOR_I:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK_I]], i32 0, i32 4
+// BLOCKS-NEXT: store %struct.__block_descriptor* bitcast ({ i64, i64, i8*, i8* }* @__block_descriptor_tmp.2 to %struct.__block_descriptor*), %struct.__block_descriptor** [[BLOCK_DESCRIPTOR_I]], align 8, !noalias !14, !llvm.access.group [[ACC_GRP15]]
+// BLOCKS-NEXT: [[BLOCK_CAPTURED_I:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK_I]], i32 0, i32 5
+// BLOCKS-NEXT: [[TMP32:%.*]] = load volatile double, double* [[TMP26]], align 8, !llvm.access.group [[ACC_GRP15]]
+// BLOCKS-NEXT: store volatile double [[TMP32]], double* [[BLOCK_CAPTURED_I]], align 8, !noalias !14, !llvm.access.group [[ACC_GRP15]]
+// BLOCKS-NEXT: [[BLOCK_CAPTURED3_I:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK_I]], i32 0, i32 6
+// BLOCKS-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP27]], align 4, !llvm.access.group [[ACC_GRP15]]
+// BLOCKS-NEXT: store i32 [[TMP33]], i32* [[BLOCK_CAPTURED3_I]], align 8, !noalias !14, !llvm.access.group [[ACC_GRP15]]
+// BLOCKS-NEXT: [[TMP34:%.*]] = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK_I]] to void ()*
+// BLOCKS-NEXT: [[BLOCK_LITERAL_I:%.*]] = bitcast void ()* [[TMP34]] to %struct.__block_literal_generic*
+// BLOCKS-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___BLOCK_LITERAL_GENERIC:%.*]], %struct.__block_literal_generic* [[BLOCK_LITERAL_I]], i32 0, i32 3
+// BLOCKS-NEXT: [[TMP36:%.*]] = bitcast %struct.__block_literal_generic* [[BLOCK_LITERAL_I]] to i8*
+// BLOCKS-NEXT: [[TMP37:%.*]] = load i8*, i8** [[TMP35]], align 8, !noalias !14, !llvm.access.group [[ACC_GRP15]]
+// BLOCKS-NEXT: [[TMP38:%.*]] = bitcast i8* [[TMP37]] to void (i8*)*
+// BLOCKS-NEXT: call void [[TMP38]](i8* noundef [[TMP36]]) #[[ATTR4]], !llvm.access.group [[ACC_GRP15]]
+// BLOCKS-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]]
+// BLOCKS-NEXT: [[ADD4_I:%.*]] = add nsw i32 [[TMP39]], 1
+// BLOCKS-NEXT: store i32 [[ADD4_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]]
+// BLOCKS-NEXT: br label [[OMP_INNER_FOR_COND_I]], !llvm.loop [[LOOP16:![0-9]+]]
+// BLOCKS: .omp_outlined..1.exit:
+// BLOCKS-NEXT: ret i32 0
+//
+//
+// ARRAY-LABEL: define {{[^@]+}}@_Z10array_funciPfP2St
+// ARRAY-SAME: (i32 noundef [[N:%.*]], float* noundef [[A:%.*]], %struct.St* noundef [[S:%.*]]) #[[ATTR0:[0-9]+]] {
+// ARRAY-NEXT: entry:
+// ARRAY-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// ARRAY-NEXT: [[A_ADDR:%.*]] = alloca float*, align 8
+// ARRAY-NEXT: [[S_ADDR:%.*]] = alloca %struct.St*, align 8
+// ARRAY-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// ARRAY-NEXT: store float* [[A]], float** [[A_ADDR]], align 8
+// ARRAY-NEXT: store %struct.St* [[S]], %struct.St** [[S_ADDR]], align 8
+// ARRAY-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// ARRAY-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+// ARRAY-NEXT: [[TMP2:%.*]] = load float*, float** [[A_ADDR]], align 8
+// ARRAY-NEXT: [[TMP3:%.*]] = load %struct.St*, %struct.St** [[S_ADDR]], align 8
+// ARRAY-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, float*, %struct.St*)* @.omp_outlined. to void (i32*, i32*, ...)*), i64 [[TMP1]], float* [[TMP2]], %struct.St* [[TMP3]])
+// ARRAY-NEXT: ret void
+//
+//
+// ARRAY-LABEL: define {{[^@]+}}@.omp_outlined.
+// ARRAY-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], float* noundef [[A:%.*]], %struct.St* noundef [[S:%.*]]) #[[ATTR1:[0-9]+]] {
+// ARRAY-NEXT: entry:
+// ARRAY-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// ARRAY-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// ARRAY-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8
+// ARRAY-NEXT: [[A_ADDR:%.*]] = alloca float*, align 8
+// ARRAY-NEXT: [[S_ADDR:%.*]] = alloca %struct.St*, align 8
+// ARRAY-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 8
+// ARRAY-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// ARRAY-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// ARRAY-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// ARRAY-NEXT: store i64 [[VLA]], i64* [[VLA_ADDR]], align 8
+// ARRAY-NEXT: store float* [[A]], float** [[A_ADDR]], align 8
+// ARRAY-NEXT: store %struct.St* [[S]], %struct.St** [[S_ADDR]], align 8
+// ARRAY-NEXT: [[TMP0:%.*]] = load i64, i64* [[VLA_ADDR]], align 8
+// ARRAY-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// ARRAY-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// ARRAY-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// ARRAY-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
+// ARRAY-NEXT: br i1 [[TMP4]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_END:%.*]]
+// ARRAY: omp_if.then:
+// ARRAY-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[AGG_CAPTURED]], i32 0, i32 0
+// ARRAY-NEXT: store i64 [[TMP0]], i64* [[TMP5]], align 8
+// ARRAY-NEXT: call void @__kmpc_taskgroup(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// ARRAY-NEXT: [[TMP6:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 1, i64 96, i64 8, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @.omp_task_entry. to i32 (i32, i8*)*))
+// ARRAY-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to %struct.kmp_task_t_with_privates*
+// ARRAY-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], %struct.kmp_task_t_with_privates* [[TMP7]], i32 0, i32 0
+// ARRAY-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP8]], i32 0, i32 0
+// ARRAY-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
+// ARRAY-NEXT: [[TMP11:%.*]] = bitcast %struct.anon* [[AGG_CAPTURED]] to i8*
+// ARRAY-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP10]], i8* align 8 [[TMP11]], i64 8, i1 false)
+// ARRAY-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], %struct.kmp_task_t_with_privates* [[TMP7]], i32 0, i32 1
+// ARRAY-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP12]], i32 0, i32 0
+// ARRAY-NEXT: [[TMP14:%.*]] = load float*, float** [[A_ADDR]], align 8
+// ARRAY-NEXT: store float* [[TMP14]], float** [[TMP13]], align 8
+// ARRAY-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP12]], i32 0, i32 1
+// ARRAY-NEXT: [[TMP16:%.*]] = load %struct.St*, %struct.St** [[S_ADDR]], align 8
+// ARRAY-NEXT: store %struct.St* [[TMP16]], %struct.St** [[TMP15]], align 8
+// ARRAY-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP8]], i32 0, i32 5
+// ARRAY-NEXT: store i64 0, i64* [[TMP17]], align 8
+// ARRAY-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP8]], i32 0, i32 6
+// ARRAY-NEXT: store i64 9, i64* [[TMP18]], align 8
+// ARRAY-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP8]], i32 0, i32 7
+// ARRAY-NEXT: store i64 1, i64* [[TMP19]], align 8
+// ARRAY-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP8]], i32 0, i32 9
+// ARRAY-NEXT: [[TMP21:%.*]] = bitcast i8** [[TMP20]] to i8*
+// ARRAY-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP21]], i8 0, i64 8, i1 false)
+// ARRAY-NEXT: [[TMP22:%.*]] = load i64, i64* [[TMP19]], align 8
+// ARRAY-NEXT: call void @__kmpc_taskloop(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i8* [[TMP6]], i32 1, i64* [[TMP17]], i64* [[TMP18]], i64 [[TMP22]], i32 1, i32 0, i64 0, i8* null)
+// ARRAY-NEXT: call void @__kmpc_end_taskgroup(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// ARRAY-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// ARRAY-NEXT: br label [[OMP_IF_END]]
+// ARRAY: omp_if.end:
+// ARRAY-NEXT: ret void
+//
+//
+// ARRAY-LABEL: define {{[^@]+}}@.omp_task_privates_map.
+// ARRAY-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], float*** noalias noundef [[TMP1:%.*]], %struct.St*** noalias noundef [[TMP2:%.*]]) #[[ATTR4:[0-9]+]] { +// ARRAY-NEXT: entry: +// ARRAY-NEXT: [[DOTADDR:%.*]] = alloca %struct..kmp_privates.t*, align 8 +// ARRAY-NEXT: [[DOTADDR1:%.*]] = alloca float***, align 8 +// ARRAY-NEXT: [[DOTADDR2:%.*]] = alloca %struct.St***, align 8 +// ARRAY-NEXT: store %struct..kmp_privates.t* [[TMP0]], %struct..kmp_privates.t** [[DOTADDR]], align 8 +// ARRAY-NEXT: store float*** [[TMP1]], float**** [[DOTADDR1]], align 8 +// ARRAY-NEXT: store %struct.St*** [[TMP2]], %struct.St**** [[DOTADDR2]], align 8 +// ARRAY-NEXT: [[TMP3:%.*]] = load %struct..kmp_privates.t*, %struct..kmp_privates.t** [[DOTADDR]], align 8 +// ARRAY-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP3]], i32 0, i32 0 +// ARRAY-NEXT: [[TMP5:%.*]] = load float***, float**** [[DOTADDR1]], align 8 +// ARRAY-NEXT: store float** [[TMP4]], float*** [[TMP5]], align 8 +// ARRAY-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP3]], i32 0, i32 1 +// ARRAY-NEXT: [[TMP7:%.*]] = load %struct.St***, %struct.St**** [[DOTADDR2]], align 8 +// ARRAY-NEXT: store %struct.St** [[TMP6]], %struct.St*** [[TMP7]], align 8 +// ARRAY-NEXT: ret void +// +// +// ARRAY-LABEL: define {{[^@]+}}@.omp_task_entry. +// ARRAY-SAME: (i32 noundef [[TMP0:%.*]], %struct.kmp_task_t_with_privates* noalias noundef [[TMP1:%.*]]) #[[ATTR5:[0-9]+]] { +// ARRAY-NEXT: entry: +// ARRAY-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4 +// ARRAY-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca i32*, align 8 +// ARRAY-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca i8*, align 8 +// ARRAY-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca void (i8*, ...)*, align 8 +// ARRAY-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca i8*, align 8 +// ARRAY-NEXT: [[DOTLB__ADDR_I:%.*]] = alloca i64, align 8 +// ARRAY-NEXT: [[DOTUB__ADDR_I:%.*]] = alloca i64, align 8 +// ARRAY-NEXT: [[DOTST__ADDR_I:%.*]] = alloca i64, align 8 +// ARRAY-NEXT: [[DOTLITER__ADDR_I:%.*]] = alloca i32, align 4 +// ARRAY-NEXT: [[DOTREDUCTIONS__ADDR_I:%.*]] = alloca i8*, align 8 +// ARRAY-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca %struct.anon*, align 8 +// ARRAY-NEXT: [[DOTFIRSTPRIV_PTR_ADDR_I:%.*]] = alloca float**, align 8 +// ARRAY-NEXT: [[DOTFIRSTPRIV_PTR_ADDR1_I:%.*]] = alloca %struct.St**, align 8 +// ARRAY-NEXT: [[I_I:%.*]] = alloca i32, align 4 +// ARRAY-NEXT: [[DOTOMP_IV_I:%.*]] = alloca i32, align 4 +// ARRAY-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 +// ARRAY-NEXT: [[DOTADDR1:%.*]] = alloca %struct.kmp_task_t_with_privates*, align 8 +// ARRAY-NEXT: store i32 [[TMP0]], i32* [[DOTADDR]], align 4 +// ARRAY-NEXT: store %struct.kmp_task_t_with_privates* [[TMP1]], %struct.kmp_task_t_with_privates** [[DOTADDR1]], align 8 +// ARRAY-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTADDR]], align 4 +// ARRAY-NEXT: [[TMP3:%.*]] = load %struct.kmp_task_t_with_privates*, %struct.kmp_task_t_with_privates** [[DOTADDR1]], align 8 +// ARRAY-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], %struct.kmp_task_t_with_privates* [[TMP3]], i32 0, i32 0 +// ARRAY-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 2 +// ARRAY-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 0 +// ARRAY-NEXT: [[TMP7:%.*]] = load i8*, i8** [[TMP6]], align 8 +// 
ARRAY-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct.anon* +// ARRAY-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], %struct.kmp_task_t_with_privates* [[TMP3]], i32 0, i32 1 +// ARRAY-NEXT: [[TMP10:%.*]] = bitcast %struct..kmp_privates.t* [[TMP9]] to i8* +// ARRAY-NEXT: [[TMP11:%.*]] = bitcast %struct.kmp_task_t_with_privates* [[TMP3]] to i8* +// ARRAY-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 5 +// ARRAY-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8 +// ARRAY-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 6 +// ARRAY-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8 +// ARRAY-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 7 +// ARRAY-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8 +// ARRAY-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 8 +// ARRAY-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 8 +// ARRAY-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 9 +// ARRAY-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 8 +// ARRAY-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]]) +// ARRAY-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]]) +// ARRAY-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META8:![0-9]+]]) +// ARRAY-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]]) +// ARRAY-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]]) +// ARRAY-NEXT: store i32 [[TMP2]], i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14 +// ARRAY-NEXT: store i32* [[TMP5]], i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !14 +// ARRAY-NEXT: store i8* [[TMP10]], i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !14 +// ARRAY-NEXT: store void (i8*, ...)* bitcast (void (%struct..kmp_privates.t*, float***, %struct.St***)* @.omp_task_privates_map. 
to void (i8*, ...)*), void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !14 +// ARRAY-NEXT: store i8* [[TMP11]], i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !14 +// ARRAY-NEXT: store i64 [[TMP13]], i64* [[DOTLB__ADDR_I]], align 8, !noalias !14 +// ARRAY-NEXT: store i64 [[TMP15]], i64* [[DOTUB__ADDR_I]], align 8, !noalias !14 +// ARRAY-NEXT: store i64 [[TMP17]], i64* [[DOTST__ADDR_I]], align 8, !noalias !14 +// ARRAY-NEXT: store i32 [[TMP19]], i32* [[DOTLITER__ADDR_I]], align 4, !noalias !14 +// ARRAY-NEXT: store i8* [[TMP21]], i8** [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias !14 +// ARRAY-NEXT: store %struct.anon* [[TMP8]], %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !14 +// ARRAY-NEXT: [[TMP22:%.*]] = load %struct.anon*, %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !14 +// ARRAY-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], %struct.anon* [[TMP22]], i32 0, i32 0 +// ARRAY-NEXT: [[TMP24:%.*]] = load i64, i64* [[TMP23]], align 8 +// ARRAY-NEXT: [[TMP25:%.*]] = load void (i8*, ...)*, void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !14 +// ARRAY-NEXT: [[TMP26:%.*]] = load i8*, i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !14 +// ARRAY-NEXT: [[TMP27:%.*]] = bitcast void (i8*, ...)* [[TMP25]] to void (i8*, float***, %struct.St***)* +// ARRAY-NEXT: call void [[TMP27]](i8* [[TMP26]], float*** [[DOTFIRSTPRIV_PTR_ADDR_I]], %struct.St*** [[DOTFIRSTPRIV_PTR_ADDR1_I]]) #[[ATTR2:[0-9]+]] +// ARRAY-NEXT: [[TMP28:%.*]] = load float**, float*** [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !14 +// ARRAY-NEXT: [[TMP29:%.*]] = load %struct.St**, %struct.St*** [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias !14 +// ARRAY-NEXT: [[TMP30:%.*]] = load i64, i64* [[DOTLB__ADDR_I]], align 8, !noalias !14 +// ARRAY-NEXT: [[CONV_I:%.*]] = trunc i64 [[TMP30]] to i32 +// ARRAY-NEXT: store i32 [[CONV_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !14 +// ARRAY-NEXT: br label [[OMP_INNER_FOR_COND_I:%.*]] +// ARRAY: omp.inner.for.cond.i: +// ARRAY-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15:![0-9]+]] +// ARRAY-NEXT: [[CONV2_I:%.*]] = sext i32 [[TMP31]] to i64 +// ARRAY-NEXT: [[TMP32:%.*]] = load i64, i64* [[DOTUB__ADDR_I]], align 8, !noalias !14, !llvm.access.group [[ACC_GRP15]] +// ARRAY-NEXT: [[CMP_I:%.*]] = icmp ule i64 [[CONV2_I]], [[TMP32]] +// ARRAY-NEXT: br i1 [[CMP_I]], label [[OMP_INNER_FOR_BODY_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]] +// ARRAY: omp.inner.for.body.i: +// ARRAY-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]] +// ARRAY-NEXT: store i32 [[TMP33]], i32* [[I_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]] +// ARRAY-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]] +// ARRAY-NEXT: [[ADD3_I:%.*]] = add nsw i32 [[TMP34]], 1 +// ARRAY-NEXT: store i32 [[ADD3_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]] +// ARRAY-NEXT: br label [[OMP_INNER_FOR_COND_I]], !llvm.loop [[LOOP16:![0-9]+]] +// ARRAY: .omp_outlined..1.exit: +// ARRAY-NEXT: ret i32 0 +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@main +// SIMD-ONLY0-SAME: () #[[ATTR0:[0-9]+]] { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[TTT:%.*]] = alloca [[STRUCT_S:%.*]], align 8 +// SIMD-ONLY0-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S]], align 8 +// SIMD-ONLY0-NEXT: [[T_VAR:%.*]] = alloca i32, 
align 4 +// SIMD-ONLY0-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4 +// SIMD-ONLY0-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S], align 16 +// SIMD-ONLY0-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S]], align 8 +// SIMD-ONLY0-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 +// SIMD-ONLY0-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 +// SIMD-ONLY0-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[I:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: store i32 0, i32* [[RETVAL]], align 4 +// SIMD-ONLY0-NEXT: call void @_ZN1SIdEC1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[TTT]]) +// SIMD-ONLY0-NEXT: call void @_ZN1SIdEC1ERKS0_d(%struct.S* noundef nonnull align 8 dereferenceable(8) [[TEST]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[TTT]], double noundef 0.000000e+00) +// SIMD-ONLY0-NEXT: store i32 0, i32* [[T_VAR]], align 4 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = bitcast [2 x i32]* [[VEC]] to i8* +// SIMD-ONLY0-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP0]], i8* align 4 bitcast ([2 x i32]* @__const.main.vec to i8*), i64 8, i1 false) +// SIMD-ONLY0-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[S_ARR]], i64 0, i64 0 +// SIMD-ONLY0-NEXT: call void @_ZN1SIdEC1Ed(%struct.S* noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_BEGIN]], double noundef 1.000000e+00) +// SIMD-ONLY0-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[ARRAYINIT_BEGIN]], i64 1 +// SIMD-ONLY0-NEXT: call void @_ZN1SIdEC1Ed(%struct.S* noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_ELEMENT]], double noundef 2.000000e+00) +// SIMD-ONLY0-NEXT: call void @_ZN1SIdEC1Ed(%struct.S* noundef nonnull align 8 dereferenceable(8) [[VAR]], double noundef 3.000000e+00) +// SIMD-ONLY0-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 +// SIMD-ONLY0-NEXT: store i64 9, i64* [[DOTOMP_UB]], align 8 +// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 +// SIMD-ONLY0-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32 +// SIMD-ONLY0-NEXT: store i32 [[CONV]], i32* [[DOTOMP_IV]], align 4 +// SIMD-ONLY0-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// SIMD-ONLY0: omp.inner.for.cond: +// SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2:![0-9]+]] +// SIMD-ONLY0-NEXT: [[CONV1:%.*]] = sext i32 [[TMP2]] to i64 +// SIMD-ONLY0-NEXT: [[TMP3:%.*]] = load i64, i64* [[DOTOMP_UB]], align 8, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY0-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV1]], [[TMP3]] +// SIMD-ONLY0-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// SIMD-ONLY0: omp.inner.for.body: +// SIMD-ONLY0-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY0-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP4]], 1 +// SIMD-ONLY0-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// SIMD-ONLY0-NEXT: store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY0-NEXT: [[TMP5:%.*]] = load i32, i32* [[T_VAR]], align 4, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY0-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[VEC]], i64 0, i64 0 +// SIMD-ONLY0-NEXT: store i32 [[TMP5]], i32* [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY0-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[S_ARR]], i64 0, i64 0 +// SIMD-ONLY0-NEXT: [[TMP6:%.*]] = bitcast 
%struct.S* [[ARRAYIDX2]] to i8* +// SIMD-ONLY0-NEXT: [[TMP7:%.*]] = bitcast %struct.S* [[VAR]] to i8* +// SIMD-ONLY0-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP6]], i8* align 8 [[TMP7]], i64 8, i1 false), !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY0-NEXT: store i32 33, i32* @_ZZ4mainE5sivar, align 4, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY0-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// SIMD-ONLY0: omp.body.continue: +// SIMD-ONLY0-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// SIMD-ONLY0: omp.inner.for.inc: +// SIMD-ONLY0-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY0-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP8]], 1 +// SIMD-ONLY0-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY0-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] +// SIMD-ONLY0: omp.inner.for.end: +// SIMD-ONLY0-NEXT: store i32 10, i32* [[I]], align 4 +// SIMD-ONLY0-NEXT: [[CALL:%.*]] = call noundef i32 @_Z5tmainIiET_v() +// SIMD-ONLY0-NEXT: store i32 [[CALL]], i32* [[RETVAL]], align 4 +// SIMD-ONLY0-NEXT: call void @_ZN1SIdED1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[VAR]]) #[[ATTR4:[0-9]+]] +// SIMD-ONLY0-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[S_ARR]], i32 0, i32 0 +// SIMD-ONLY0-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[ARRAY_BEGIN]], i64 2 +// SIMD-ONLY0-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] +// SIMD-ONLY0: arraydestroy.body: +// SIMD-ONLY0-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi %struct.S* [ [[TMP9]], [[OMP_INNER_FOR_END]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] +// SIMD-ONLY0-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 +// SIMD-ONLY0-NEXT: call void @_ZN1SIdED1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// SIMD-ONLY0-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq %struct.S* [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] +// SIMD-ONLY0-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE4:%.*]], label [[ARRAYDESTROY_BODY]] +// SIMD-ONLY0: arraydestroy.done4: +// SIMD-ONLY0-NEXT: call void @_ZN1SIdED1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[TEST]]) #[[ATTR4]] +// SIMD-ONLY0-NEXT: call void @_ZN1SIdED1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[TTT]]) #[[ATTR4]] +// SIMD-ONLY0-NEXT: [[TMP10:%.*]] = load i32, i32* [[RETVAL]], align 4 +// SIMD-ONLY0-NEXT: ret i32 [[TMP10]] +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIdEC1Ev +// SIMD-ONLY0-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY0-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: call void @_ZN1SIdEC2Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS1]]) +// SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIdEC1ERKS0_d +// SIMD-ONLY0-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[S:%.*]], double noundef [[T:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = 
alloca %struct.S*, align 8 +// SIMD-ONLY0-NEXT: [[S_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY0-NEXT: [[T_ADDR:%.*]] = alloca double, align 8 +// SIMD-ONLY0-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: store %struct.S* [[S]], %struct.S** [[S_ADDR]], align 8 +// SIMD-ONLY0-NEXT: store double [[T]], double* [[T_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load %struct.S*, %struct.S** [[S_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load double, double* [[T_ADDR]], align 8 +// SIMD-ONLY0-NEXT: call void @_ZN1SIdEC2ERKS0_d(%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS1]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[TMP0]], double noundef [[TMP1]]) +// SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIdEC1Ed +// SIMD-ONLY0-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], double noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY0-NEXT: [[A_ADDR:%.*]] = alloca double, align 8 +// SIMD-ONLY0-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: store double [[A]], double* [[A_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load double, double* [[A_ADDR]], align 8 +// SIMD-ONLY0-NEXT: call void @_ZN1SIdEC2Ed(%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS1]], double noundef [[TMP0]]) +// SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_Z5tmainIiET_v +// SIMD-ONLY0-SAME: () #[[ATTR3:[0-9]+]] { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[TTT:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 +// SIMD-ONLY0-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0]], align 4 +// SIMD-ONLY0-NEXT: [[T_VAR:%.*]] = alloca i32, align 128 +// SIMD-ONLY0-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4 +// SIMD-ONLY0-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S.0], align 4 +// SIMD-ONLY0-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S_0]], align 4 +// SIMD-ONLY0-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 +// SIMD-ONLY0-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 +// SIMD-ONLY0-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[I:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: call void @_ZN1SIiEC1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TTT]]) +// SIMD-ONLY0-NEXT: call void @_ZN1SIiEC1ERKS0_i(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TEST]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TTT]], i32 noundef 0) +// SIMD-ONLY0-NEXT: store i32 0, i32* [[T_VAR]], align 128 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = bitcast [2 x i32]* [[VEC]] to i8* +// SIMD-ONLY0-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP0]], i8* align 4 bitcast ([2 x i32]* @__const._Z5tmainIiET_v.vec to i8*), i64 8, i1 false) +// SIMD-ONLY0-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[S_ARR]], i64 0, i64 0 +// SIMD-ONLY0-NEXT: call void @_ZN1SIiEC1Ei(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1) +// SIMD-ONLY0-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], 
%struct.S.0* [[ARRAYINIT_BEGIN]], i64 1 +// SIMD-ONLY0-NEXT: call void @_ZN1SIiEC1Ei(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2) +// SIMD-ONLY0-NEXT: call void @_ZN1SIiEC1Ei(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[VAR]], i32 noundef 3) +// SIMD-ONLY0-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 +// SIMD-ONLY0-NEXT: store i64 9, i64* [[DOTOMP_UB]], align 8 +// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 +// SIMD-ONLY0-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32 +// SIMD-ONLY0-NEXT: store i32 [[CONV]], i32* [[DOTOMP_IV]], align 4 +// SIMD-ONLY0-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// SIMD-ONLY0: omp.inner.for.cond: +// SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP6:![0-9]+]] +// SIMD-ONLY0-NEXT: [[CONV1:%.*]] = sext i32 [[TMP2]] to i64 +// SIMD-ONLY0-NEXT: [[TMP3:%.*]] = load i64, i64* [[DOTOMP_UB]], align 8, !llvm.access.group [[ACC_GRP6]] +// SIMD-ONLY0-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV1]], [[TMP3]] +// SIMD-ONLY0-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// SIMD-ONLY0: omp.inner.for.body: +// SIMD-ONLY0-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP6]] +// SIMD-ONLY0-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP4]], 1 +// SIMD-ONLY0-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// SIMD-ONLY0-NEXT: store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP6]] +// SIMD-ONLY0-NEXT: [[TMP5:%.*]] = load i32, i32* [[T_VAR]], align 128, !llvm.access.group [[ACC_GRP6]] +// SIMD-ONLY0-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[VEC]], i64 0, i64 0 +// SIMD-ONLY0-NEXT: store i32 [[TMP5]], i32* [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP6]] +// SIMD-ONLY0-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[S_ARR]], i64 0, i64 0 +// SIMD-ONLY0-NEXT: [[TMP6:%.*]] = bitcast %struct.S.0* [[ARRAYIDX2]] to i8* +// SIMD-ONLY0-NEXT: [[TMP7:%.*]] = bitcast %struct.S.0* [[VAR]] to i8* +// SIMD-ONLY0-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP6]], i8* align 4 [[TMP7]], i64 4, i1 false), !llvm.access.group [[ACC_GRP6]] +// SIMD-ONLY0-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// SIMD-ONLY0: omp.body.continue: +// SIMD-ONLY0-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// SIMD-ONLY0: omp.inner.for.inc: +// SIMD-ONLY0-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP6]] +// SIMD-ONLY0-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP8]], 1 +// SIMD-ONLY0-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP6]] +// SIMD-ONLY0-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]] +// SIMD-ONLY0: omp.inner.for.end: +// SIMD-ONLY0-NEXT: store i32 10, i32* [[I]], align 4 +// SIMD-ONLY0-NEXT: store i32 0, i32* [[RETVAL]], align 4 +// SIMD-ONLY0-NEXT: call void @_ZN1SIiED1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR4]] +// SIMD-ONLY0-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[S_ARR]], i32 0, i32 0 +// SIMD-ONLY0-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_S_0]], %struct.S.0* [[ARRAY_BEGIN]], i64 2 +// SIMD-ONLY0-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] +// SIMD-ONLY0: arraydestroy.body: +// SIMD-ONLY0-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi %struct.S.0* [ [[TMP9]], [[OMP_INNER_FOR_END]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], 
[[ARRAYDESTROY_BODY]] ] +// SIMD-ONLY0-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], %struct.S.0* [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 +// SIMD-ONLY0-NEXT: call void @_ZN1SIiED1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// SIMD-ONLY0-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq %struct.S.0* [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] +// SIMD-ONLY0-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE4:%.*]], label [[ARRAYDESTROY_BODY]] +// SIMD-ONLY0: arraydestroy.done4: +// SIMD-ONLY0-NEXT: call void @_ZN1SIiED1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// SIMD-ONLY0-NEXT: call void @_ZN1SIiED1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TTT]]) #[[ATTR4]] +// SIMD-ONLY0-NEXT: [[TMP10:%.*]] = load i32, i32* [[RETVAL]], align 4 +// SIMD-ONLY0-NEXT: ret i32 [[TMP10]] +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIdED1Ev +// SIMD-ONLY0-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY0-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: call void @_ZN1SIdED2Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS1]]) #[[ATTR4]] +// SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIdEC2Ev +// SIMD-ONLY0-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY0-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY0-NEXT: store double 0.000000e+00, double* [[F]], align 8 +// SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIdED2Ev +// SIMD-ONLY0-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY0-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIdEC2ERKS0_d +// SIMD-ONLY0-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[S:%.*]], double noundef [[T:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY0-NEXT: [[S_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY0-NEXT: [[T_ADDR:%.*]] = alloca double, align 8 +// SIMD-ONLY0-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: store %struct.S* [[S]], %struct.S** [[S_ADDR]], align 8 +// SIMD-ONLY0-NEXT: store double [[T]], double* [[T_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], 
%struct.S* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load %struct.S*, %struct.S** [[S_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[F2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[TMP0]], i32 0, i32 0 +// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load double, double* [[F2]], align 8 +// SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load double, double* [[T_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], [[TMP2]] +// SIMD-ONLY0-NEXT: store double [[ADD]], double* [[F]], align 8 +// SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIdEC2Ed +// SIMD-ONLY0-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], double noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY0-NEXT: [[A_ADDR:%.*]] = alloca double, align 8 +// SIMD-ONLY0-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: store double [[A]], double* [[A_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load double, double* [[A_ADDR]], align 8 +// SIMD-ONLY0-NEXT: store double [[TMP0]], double* [[F]], align 8 +// SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIiEC1Ev +// SIMD-ONLY0-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY0-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: call void @_ZN1SIiEC2Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS1]]) +// SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIiEC1ERKS0_i +// SIMD-ONLY0-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[S:%.*]], i32 noundef [[T:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY0-NEXT: [[S_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY0-NEXT: [[T_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: store %struct.S.0* [[S]], %struct.S.0** [[S_ADDR]], align 8 +// SIMD-ONLY0-NEXT: store i32 [[T]], i32* [[T_ADDR]], align 4 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load %struct.S.0*, %struct.S.0** [[S_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load i32, i32* [[T_ADDR]], align 4 +// SIMD-ONLY0-NEXT: call void @_ZN1SIiEC2ERKS0_i(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS1]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TMP0]], i32 noundef [[TMP1]]) +// SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIiEC1Ei +// SIMD-ONLY0-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca 
%struct.S.0*, align 8 +// SIMD-ONLY0-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// SIMD-ONLY0-NEXT: call void @_ZN1SIiEC2Ei(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS1]], i32 noundef [[TMP0]]) +// SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIiED1Ev +// SIMD-ONLY0-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY0-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: call void @_ZN1SIiED2Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIiEC2Ev +// SIMD-ONLY0-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY0-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], %struct.S.0* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY0-NEXT: store i32 0, i32* [[F]], align 4 +// SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIiEC2ERKS0_i +// SIMD-ONLY0-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[S:%.*]], i32 noundef [[T:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY0-NEXT: [[S_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY0-NEXT: [[T_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: store %struct.S.0* [[S]], %struct.S.0** [[S_ADDR]], align 8 +// SIMD-ONLY0-NEXT: store i32 [[T]], i32* [[T_ADDR]], align 4 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], %struct.S.0* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load %struct.S.0*, %struct.S.0** [[S_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[F2:%.*]] = getelementptr inbounds [[STRUCT_S_0]], %struct.S.0* [[TMP0]], i32 0, i32 0 +// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load i32, i32* [[F2]], align 4 +// SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load i32, i32* [[T_ADDR]], align 4 +// SIMD-ONLY0-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP2]] +// SIMD-ONLY0-NEXT: store i32 [[ADD]], i32* [[F]], align 4 +// SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIiEC2Ei +// SIMD-ONLY0-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca 
%struct.S.0*, align 8 +// SIMD-ONLY0-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], %struct.S.0* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// SIMD-ONLY0-NEXT: store i32 [[TMP0]], i32* [[F]], align 4 +// SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1SIiED2Ev +// SIMD-ONLY0-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY0-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@main +// SIMD-ONLY1-SAME: () #[[ATTR0:[0-9]+]] { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: [[TTT:%.*]] = alloca [[STRUCT_S:%.*]], align 8 +// SIMD-ONLY1-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S]], align 8 +// SIMD-ONLY1-NEXT: [[T_VAR:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4 +// SIMD-ONLY1-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S], align 16 +// SIMD-ONLY1-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S]], align 8 +// SIMD-ONLY1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 +// SIMD-ONLY1-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 +// SIMD-ONLY1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: [[I:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: store i32 0, i32* [[RETVAL]], align 4 +// SIMD-ONLY1-NEXT: call void @_ZN1SIdEC1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[TTT]]) +// SIMD-ONLY1-NEXT: call void @_ZN1SIdEC1ERKS0_d(%struct.S* noundef nonnull align 8 dereferenceable(8) [[TEST]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[TTT]], double noundef 0.000000e+00) +// SIMD-ONLY1-NEXT: store i32 0, i32* [[T_VAR]], align 4 +// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = bitcast [2 x i32]* [[VEC]] to i8* +// SIMD-ONLY1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP0]], i8* align 4 bitcast ([2 x i32]* @__const.main.vec to i8*), i64 8, i1 false) +// SIMD-ONLY1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[S_ARR]], i64 0, i64 0 +// SIMD-ONLY1-NEXT: call void @_ZN1SIdEC1Ed(%struct.S* noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_BEGIN]], double noundef 1.000000e+00) +// SIMD-ONLY1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[ARRAYINIT_BEGIN]], i64 1 +// SIMD-ONLY1-NEXT: call void @_ZN1SIdEC1Ed(%struct.S* noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_ELEMENT]], double noundef 2.000000e+00) +// SIMD-ONLY1-NEXT: call void @_ZN1SIdEC1Ed(%struct.S* noundef nonnull align 8 dereferenceable(8) [[VAR]], double noundef 3.000000e+00) +// SIMD-ONLY1-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 +// SIMD-ONLY1-NEXT: store i64 9, i64* [[DOTOMP_UB]], align 8 +// SIMD-ONLY1-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 +// SIMD-ONLY1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to 
i32 +// SIMD-ONLY1-NEXT: store i32 [[CONV]], i32* [[DOTOMP_IV]], align 4 +// SIMD-ONLY1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// SIMD-ONLY1: omp.inner.for.cond: +// SIMD-ONLY1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2:![0-9]+]] +// SIMD-ONLY1-NEXT: [[CONV1:%.*]] = sext i32 [[TMP2]] to i64 +// SIMD-ONLY1-NEXT: [[TMP3:%.*]] = load i64, i64* [[DOTOMP_UB]], align 8, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY1-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV1]], [[TMP3]] +// SIMD-ONLY1-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// SIMD-ONLY1: omp.inner.for.body: +// SIMD-ONLY1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP4]], 1 +// SIMD-ONLY1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// SIMD-ONLY1-NEXT: store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY1-NEXT: [[TMP5:%.*]] = load i32, i32* [[T_VAR]], align 4, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[VEC]], i64 0, i64 0 +// SIMD-ONLY1-NEXT: store i32 [[TMP5]], i32* [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[S_ARR]], i64 0, i64 0 +// SIMD-ONLY1-NEXT: [[TMP6:%.*]] = bitcast %struct.S* [[ARRAYIDX2]] to i8* +// SIMD-ONLY1-NEXT: [[TMP7:%.*]] = bitcast %struct.S* [[VAR]] to i8* +// SIMD-ONLY1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP6]], i8* align 8 [[TMP7]], i64 8, i1 false), !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY1-NEXT: store i32 33, i32* @_ZZ4mainE5sivar, align 4, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// SIMD-ONLY1: omp.body.continue: +// SIMD-ONLY1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// SIMD-ONLY1: omp.inner.for.inc: +// SIMD-ONLY1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP8]], 1 +// SIMD-ONLY1-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] +// SIMD-ONLY1: omp.inner.for.end: +// SIMD-ONLY1-NEXT: store i32 10, i32* [[I]], align 4 +// SIMD-ONLY1-NEXT: [[CALL:%.*]] = call noundef i32 @_Z5tmainIiET_v() +// SIMD-ONLY1-NEXT: store i32 [[CALL]], i32* [[RETVAL]], align 4 +// SIMD-ONLY1-NEXT: call void @_ZN1SIdED1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[VAR]]) #[[ATTR4:[0-9]+]] +// SIMD-ONLY1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[S_ARR]], i32 0, i32 0 +// SIMD-ONLY1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[ARRAY_BEGIN]], i64 2 +// SIMD-ONLY1-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] +// SIMD-ONLY1: arraydestroy.body: +// SIMD-ONLY1-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi %struct.S* [ [[TMP9]], [[OMP_INNER_FOR_END]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] +// SIMD-ONLY1-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 +// SIMD-ONLY1-NEXT: call void @_ZN1SIdED1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// SIMD-ONLY1-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq %struct.S* [[ARRAYDESTROY_ELEMENT]], 
[[ARRAY_BEGIN]] +// SIMD-ONLY1-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE4:%.*]], label [[ARRAYDESTROY_BODY]] +// SIMD-ONLY1: arraydestroy.done4: +// SIMD-ONLY1-NEXT: call void @_ZN1SIdED1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[TEST]]) #[[ATTR4]] +// SIMD-ONLY1-NEXT: call void @_ZN1SIdED1Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[TTT]]) #[[ATTR4]] +// SIMD-ONLY1-NEXT: [[TMP10:%.*]] = load i32, i32* [[RETVAL]], align 4 +// SIMD-ONLY1-NEXT: ret i32 [[TMP10]] +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN1SIdEC1Ev +// SIMD-ONLY1-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY1-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: call void @_ZN1SIdEC2Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS1]]) +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN1SIdEC1ERKS0_d +// SIMD-ONLY1-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[S:%.*]], double noundef [[T:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY1-NEXT: [[S_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY1-NEXT: [[T_ADDR:%.*]] = alloca double, align 8 +// SIMD-ONLY1-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: store %struct.S* [[S]], %struct.S** [[S_ADDR]], align 8 +// SIMD-ONLY1-NEXT: store double [[T]], double* [[T_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = load %struct.S*, %struct.S** [[S_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[TMP1:%.*]] = load double, double* [[T_ADDR]], align 8 +// SIMD-ONLY1-NEXT: call void @_ZN1SIdEC2ERKS0_d(%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS1]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[TMP0]], double noundef [[TMP1]]) +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN1SIdEC1Ed +// SIMD-ONLY1-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], double noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY1-NEXT: [[A_ADDR:%.*]] = alloca double, align 8 +// SIMD-ONLY1-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: store double [[A]], double* [[A_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = load double, double* [[A_ADDR]], align 8 +// SIMD-ONLY1-NEXT: call void @_ZN1SIdEC2Ed(%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS1]], double noundef [[TMP0]]) +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_Z5tmainIiET_v +// SIMD-ONLY1-SAME: () #[[ATTR3:[0-9]+]] { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: [[TTT:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 +// SIMD-ONLY1-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0]], align 4 +// SIMD-ONLY1-NEXT: [[T_VAR:%.*]] = alloca 
i32, align 128 +// SIMD-ONLY1-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4 +// SIMD-ONLY1-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S.0], align 4 +// SIMD-ONLY1-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S_0]], align 4 +// SIMD-ONLY1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 +// SIMD-ONLY1-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 +// SIMD-ONLY1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: [[I:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: call void @_ZN1SIiEC1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TTT]]) +// SIMD-ONLY1-NEXT: call void @_ZN1SIiEC1ERKS0_i(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TEST]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TTT]], i32 noundef 0) +// SIMD-ONLY1-NEXT: store i32 0, i32* [[T_VAR]], align 128 +// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = bitcast [2 x i32]* [[VEC]] to i8* +// SIMD-ONLY1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP0]], i8* align 4 bitcast ([2 x i32]* @__const._Z5tmainIiET_v.vec to i8*), i64 8, i1 false) +// SIMD-ONLY1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[S_ARR]], i64 0, i64 0 +// SIMD-ONLY1-NEXT: call void @_ZN1SIiEC1Ei(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1) +// SIMD-ONLY1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], %struct.S.0* [[ARRAYINIT_BEGIN]], i64 1 +// SIMD-ONLY1-NEXT: call void @_ZN1SIiEC1Ei(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2) +// SIMD-ONLY1-NEXT: call void @_ZN1SIiEC1Ei(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[VAR]], i32 noundef 3) +// SIMD-ONLY1-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 +// SIMD-ONLY1-NEXT: store i64 9, i64* [[DOTOMP_UB]], align 8 +// SIMD-ONLY1-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 +// SIMD-ONLY1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32 +// SIMD-ONLY1-NEXT: store i32 [[CONV]], i32* [[DOTOMP_IV]], align 4 +// SIMD-ONLY1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// SIMD-ONLY1: omp.inner.for.cond: +// SIMD-ONLY1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP6:![0-9]+]] +// SIMD-ONLY1-NEXT: [[CONV1:%.*]] = sext i32 [[TMP2]] to i64 +// SIMD-ONLY1-NEXT: [[TMP3:%.*]] = load i64, i64* [[DOTOMP_UB]], align 8, !llvm.access.group [[ACC_GRP6]] +// SIMD-ONLY1-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV1]], [[TMP3]] +// SIMD-ONLY1-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// SIMD-ONLY1: omp.inner.for.body: +// SIMD-ONLY1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP6]] +// SIMD-ONLY1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP4]], 1 +// SIMD-ONLY1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// SIMD-ONLY1-NEXT: store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP6]] +// SIMD-ONLY1-NEXT: [[TMP5:%.*]] = load i32, i32* [[T_VAR]], align 128, !llvm.access.group [[ACC_GRP6]] +// SIMD-ONLY1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[VEC]], i64 0, i64 0 +// SIMD-ONLY1-NEXT: store i32 [[TMP5]], i32* [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP6]] +// SIMD-ONLY1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[S_ARR]], i64 0, i64 0 +// SIMD-ONLY1-NEXT: [[TMP6:%.*]] = bitcast %struct.S.0* [[ARRAYIDX2]] to i8* +// SIMD-ONLY1-NEXT: [[TMP7:%.*]] = 
bitcast %struct.S.0* [[VAR]] to i8* +// SIMD-ONLY1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP6]], i8* align 4 [[TMP7]], i64 4, i1 false), !llvm.access.group [[ACC_GRP6]] +// SIMD-ONLY1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// SIMD-ONLY1: omp.body.continue: +// SIMD-ONLY1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// SIMD-ONLY1: omp.inner.for.inc: +// SIMD-ONLY1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP6]] +// SIMD-ONLY1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP8]], 1 +// SIMD-ONLY1-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP6]] +// SIMD-ONLY1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]] +// SIMD-ONLY1: omp.inner.for.end: +// SIMD-ONLY1-NEXT: store i32 10, i32* [[I]], align 4 +// SIMD-ONLY1-NEXT: store i32 0, i32* [[RETVAL]], align 4 +// SIMD-ONLY1-NEXT: call void @_ZN1SIiED1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR4]] +// SIMD-ONLY1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[S_ARR]], i32 0, i32 0 +// SIMD-ONLY1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_S_0]], %struct.S.0* [[ARRAY_BEGIN]], i64 2 +// SIMD-ONLY1-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] +// SIMD-ONLY1: arraydestroy.body: +// SIMD-ONLY1-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi %struct.S.0* [ [[TMP9]], [[OMP_INNER_FOR_END]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] +// SIMD-ONLY1-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], %struct.S.0* [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 +// SIMD-ONLY1-NEXT: call void @_ZN1SIiED1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// SIMD-ONLY1-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq %struct.S.0* [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] +// SIMD-ONLY1-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE4:%.*]], label [[ARRAYDESTROY_BODY]] +// SIMD-ONLY1: arraydestroy.done4: +// SIMD-ONLY1-NEXT: call void @_ZN1SIiED1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// SIMD-ONLY1-NEXT: call void @_ZN1SIiED1Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TTT]]) #[[ATTR4]] +// SIMD-ONLY1-NEXT: [[TMP10:%.*]] = load i32, i32* [[RETVAL]], align 4 +// SIMD-ONLY1-NEXT: ret i32 [[TMP10]] +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN1SIdED1Ev +// SIMD-ONLY1-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY1-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: call void @_ZN1SIdED2Ev(%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS1]]) #[[ATTR4]] +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN1SIdEC2Ev +// SIMD-ONLY1-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY1-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[THIS1]], i32 0, i32 0 +// 
SIMD-ONLY1-NEXT: store double 0.000000e+00, double* [[F]], align 8 +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN1SIdED2Ev +// SIMD-ONLY1-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY1-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN1SIdEC2ERKS0_d +// SIMD-ONLY1-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], %struct.S* noundef nonnull align 8 dereferenceable(8) [[S:%.*]], double noundef [[T:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY1-NEXT: [[S_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY1-NEXT: [[T_ADDR:%.*]] = alloca double, align 8 +// SIMD-ONLY1-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: store %struct.S* [[S]], %struct.S** [[S_ADDR]], align 8 +// SIMD-ONLY1-NEXT: store double [[T]], double* [[T_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = load %struct.S*, %struct.S** [[S_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[F2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[TMP0]], i32 0, i32 0 +// SIMD-ONLY1-NEXT: [[TMP1:%.*]] = load double, double* [[F2]], align 8 +// SIMD-ONLY1-NEXT: [[TMP2:%.*]] = load double, double* [[T_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], [[TMP2]] +// SIMD-ONLY1-NEXT: store double [[ADD]], double* [[F]], align 8 +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN1SIdEC2Ed +// SIMD-ONLY1-SAME: (%struct.S* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], double noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S*, align 8 +// SIMD-ONLY1-NEXT: [[A_ADDR:%.*]] = alloca double, align 8 +// SIMD-ONLY1-NEXT: store %struct.S* [[THIS]], %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: store double [[A]], double* [[A_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = load double, double* [[A_ADDR]], align 8 +// SIMD-ONLY1-NEXT: store double [[TMP0]], double* [[F]], align 8 +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN1SIiEC1Ev +// SIMD-ONLY1-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY1-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: call void @_ZN1SIiEC2Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS1]]) +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define 
{{[^@]+}}@_ZN1SIiEC1ERKS0_i +// SIMD-ONLY1-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[S:%.*]], i32 noundef [[T:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY1-NEXT: [[S_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY1-NEXT: [[T_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: store %struct.S.0* [[S]], %struct.S.0** [[S_ADDR]], align 8 +// SIMD-ONLY1-NEXT: store i32 [[T]], i32* [[T_ADDR]], align 4 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = load %struct.S.0*, %struct.S.0** [[S_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[TMP1:%.*]] = load i32, i32* [[T_ADDR]], align 4 +// SIMD-ONLY1-NEXT: call void @_ZN1SIiEC2ERKS0_i(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS1]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[TMP0]], i32 noundef [[TMP1]]) +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN1SIiEC1Ei +// SIMD-ONLY1-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY1-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// SIMD-ONLY1-NEXT: call void @_ZN1SIiEC2Ei(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS1]], i32 noundef [[TMP0]]) +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN1SIiED1Ev +// SIMD-ONLY1-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY1-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: call void @_ZN1SIiED2Ev(%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN1SIiEC2Ev +// SIMD-ONLY1-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY1-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], %struct.S.0* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY1-NEXT: store i32 0, i32* [[F]], align 4 +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN1SIiEC2ERKS0_i +// SIMD-ONLY1-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], %struct.S.0* noundef nonnull align 4 dereferenceable(4) [[S:%.*]], i32 noundef [[T:%.*]]) 
unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY1-NEXT: [[S_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY1-NEXT: [[T_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: store %struct.S.0* [[S]], %struct.S.0** [[S_ADDR]], align 8 +// SIMD-ONLY1-NEXT: store i32 [[T]], i32* [[T_ADDR]], align 4 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], %struct.S.0* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = load %struct.S.0*, %struct.S.0** [[S_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[F2:%.*]] = getelementptr inbounds [[STRUCT_S_0]], %struct.S.0* [[TMP0]], i32 0, i32 0 +// SIMD-ONLY1-NEXT: [[TMP1:%.*]] = load i32, i32* [[F2]], align 4 +// SIMD-ONLY1-NEXT: [[TMP2:%.*]] = load i32, i32* [[T_ADDR]], align 4 +// SIMD-ONLY1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP2]] +// SIMD-ONLY1-NEXT: store i32 [[ADD]], i32* [[F]], align 4 +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN1SIiEC2Ei +// SIMD-ONLY1-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY1-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], %struct.S.0* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// SIMD-ONLY1-NEXT: store i32 [[TMP0]], i32* [[F]], align 4 +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN1SIiED2Ev +// SIMD-ONLY1-SAME: (%struct.S.0* noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S.0*, align 8 +// SIMD-ONLY1-NEXT: store %struct.S.0* [[THIS]], %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S.0*, %struct.S.0** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: ret void +// +// +// SIMD-ONLY2-LABEL: define {{[^@]+}}@main +// SIMD-ONLY2-SAME: () #[[ATTR0:[0-9]+]] { +// SIMD-ONLY2-NEXT: entry: +// SIMD-ONLY2-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// SIMD-ONLY2-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_ANON:%.*]], align 1 +// SIMD-ONLY2-NEXT: store i32 0, i32* [[RETVAL]], align 4 +// SIMD-ONLY2-NEXT: call void @"_ZZ4mainENK3$_0clEv"(%class.anon* noundef nonnull align 1 dereferenceable(1) [[REF_TMP]]) +// SIMD-ONLY2-NEXT: ret i32 0 +// +// +// SIMD-ONLY3-LABEL: define {{[^@]+}}@main +// SIMD-ONLY3-SAME: () #[[ATTR1:[0-9]+]] { +// SIMD-ONLY3-NEXT: entry: +// SIMD-ONLY3-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// SIMD-ONLY3-NEXT: store i32 0, i32* [[RETVAL]], align 4 +// SIMD-ONLY3-NEXT: [[TMP0:%.*]] = load i8*, i8** getelementptr inbounds ([[STRUCT___BLOCK_LITERAL_GENERIC:%.*]], %struct.__block_literal_generic* bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor* }* @__block_literal_global to %struct.__block_literal_generic*), i32 0, 
i32 3), align 8 +// SIMD-ONLY3-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to void (i8*)* +// SIMD-ONLY3-NEXT: call void [[TMP1]](i8* noundef bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor* }* @__block_literal_global to i8*)) +// SIMD-ONLY3-NEXT: ret i32 0 +// +// +// SIMD-ONLY3-LABEL: define {{[^@]+}}@__main_block_invoke +// SIMD-ONLY3-SAME: (i8* noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR2:[0-9]+]] { +// SIMD-ONLY3-NEXT: entry: +// SIMD-ONLY3-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca i8*, align 8 +// SIMD-ONLY3-NEXT: [[BLOCK_ADDR:%.*]] = alloca <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>*, align 8 +// SIMD-ONLY3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// SIMD-ONLY3-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 +// SIMD-ONLY3-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 +// SIMD-ONLY3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// SIMD-ONLY3-NEXT: [[I:%.*]] = alloca i32, align 4 +// SIMD-ONLY3-NEXT: [[BLOCK2:%.*]] = alloca <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, align 8 +// SIMD-ONLY3-NEXT: store i8* [[DOTBLOCK_DESCRIPTOR]], i8** [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8 +// SIMD-ONLY3-NEXT: [[BLOCK:%.*]] = bitcast i8* [[DOTBLOCK_DESCRIPTOR]] to <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>* +// SIMD-ONLY3-NEXT: store <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>* [[BLOCK]], <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>** [[BLOCK_ADDR]], align 8 +// SIMD-ONLY3-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 +// SIMD-ONLY3-NEXT: store i64 9, i64* [[DOTOMP_UB]], align 8 +// SIMD-ONLY3-NEXT: [[TMP0:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 +// SIMD-ONLY3-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32 +// SIMD-ONLY3-NEXT: store i32 [[CONV]], i32* [[DOTOMP_IV]], align 4 +// SIMD-ONLY3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// SIMD-ONLY3: omp.inner.for.cond: +// SIMD-ONLY3-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2:![0-9]+]] +// SIMD-ONLY3-NEXT: [[CONV1:%.*]] = sext i32 [[TMP1]] to i64 +// SIMD-ONLY3-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTOMP_UB]], align 8, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY3-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV1]], [[TMP2]] +// SIMD-ONLY3-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// SIMD-ONLY3: omp.inner.for.body: +// SIMD-ONLY3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP3]], 1 +// SIMD-ONLY3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// SIMD-ONLY3-NEXT: store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY3-NEXT: store double 1.000000e+00, double* @g, align 8, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY3-NEXT: store i32 11, i32* @_ZZ4mainE5sivar, align 4, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY3-NEXT: [[BLOCK_ISA:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK2]], i32 0, i32 0 +// SIMD-ONLY3-NEXT: store i8* bitcast (i8** @_NSConcreteStackBlock to i8*), i8** [[BLOCK_ISA]], align 8, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY3-NEXT: [[BLOCK_FLAGS:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK2]], i32 0, i32 1 +// SIMD-ONLY3-NEXT: store i32 1073741824, i32* [[BLOCK_FLAGS]], align 8, 
!llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY3-NEXT: [[BLOCK_RESERVED:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK2]], i32 0, i32 2 +// SIMD-ONLY3-NEXT: store i32 0, i32* [[BLOCK_RESERVED]], align 4, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY3-NEXT: [[BLOCK_INVOKE:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK2]], i32 0, i32 3 +// SIMD-ONLY3-NEXT: store i8* bitcast (void (i8*)* @__main_block_invoke_2 to i8*), i8** [[BLOCK_INVOKE]], align 8, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY3-NEXT: [[BLOCK_DESCRIPTOR:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK2]], i32 0, i32 4 +// SIMD-ONLY3-NEXT: store %struct.__block_descriptor* bitcast ({ i64, i64, i8*, i8* }* @__block_descriptor_tmp.1 to %struct.__block_descriptor*), %struct.__block_descriptor** [[BLOCK_DESCRIPTOR]], align 8, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY3-NEXT: [[BLOCK_CAPTURED:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK2]], i32 0, i32 5 +// SIMD-ONLY3-NEXT: [[TMP4:%.*]] = load volatile double, double* @g, align 8, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY3-NEXT: store volatile double [[TMP4]], double* [[BLOCK_CAPTURED]], align 8, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY3-NEXT: [[BLOCK_CAPTURED3:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK2]], i32 0, i32 6 +// SIMD-ONLY3-NEXT: [[TMP5:%.*]] = load i32, i32* @_ZZ4mainE5sivar, align 4, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY3-NEXT: store i32 [[TMP5]], i32* [[BLOCK_CAPTURED3]], align 8, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY3-NEXT: [[TMP6:%.*]] = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK2]] to void ()* +// SIMD-ONLY3-NEXT: [[BLOCK_LITERAL:%.*]] = bitcast void ()* [[TMP6]] to %struct.__block_literal_generic* +// SIMD-ONLY3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___BLOCK_LITERAL_GENERIC:%.*]], %struct.__block_literal_generic* [[BLOCK_LITERAL]], i32 0, i32 3 +// SIMD-ONLY3-NEXT: [[TMP8:%.*]] = bitcast %struct.__block_literal_generic* [[BLOCK_LITERAL]] to i8* +// SIMD-ONLY3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP7]], align 8, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY3-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to void (i8*)* +// SIMD-ONLY3-NEXT: call void [[TMP10]](i8* noundef [[TMP8]]), !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// SIMD-ONLY3: omp.body.continue: +// SIMD-ONLY3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// SIMD-ONLY3: omp.inner.for.inc: +// SIMD-ONLY3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], 1 +// SIMD-ONLY3-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] +// SIMD-ONLY3: omp.inner.for.end: +// SIMD-ONLY3-NEXT: store i32 10, i32* [[I]], align 4 +// SIMD-ONLY3-NEXT: ret 
void +// +// +// SIMD-ONLY3-LABEL: define {{[^@]+}}@__main_block_invoke_2 +// SIMD-ONLY3-SAME: (i8* noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR2]] { +// SIMD-ONLY3-NEXT: entry: +// SIMD-ONLY3-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca i8*, align 8 +// SIMD-ONLY3-NEXT: [[BLOCK_ADDR:%.*]] = alloca <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>*, align 8 +// SIMD-ONLY3-NEXT: store i8* [[DOTBLOCK_DESCRIPTOR]], i8** [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8 +// SIMD-ONLY3-NEXT: [[BLOCK:%.*]] = bitcast i8* [[DOTBLOCK_DESCRIPTOR]] to <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* +// SIMD-ONLY3-NEXT: store <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK]], <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>** [[BLOCK_ADDR]], align 8 +// SIMD-ONLY3-NEXT: [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK]], i32 0, i32 5 +// SIMD-ONLY3-NEXT: store double 2.000000e+00, double* [[BLOCK_CAPTURE_ADDR]], align 8 +// SIMD-ONLY3-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, double, i32 }>* [[BLOCK]], i32 0, i32 6 +// SIMD-ONLY3-NEXT: store i32 22, i32* [[BLOCK_CAPTURE_ADDR1]], align 8 +// SIMD-ONLY3-NEXT: ret void +// +// +// SIMD-ONLY4-LABEL: define {{[^@]+}}@_Z10array_funciPfP2St +// SIMD-ONLY4-SAME: (i32 noundef [[N:%.*]], float* noundef [[A:%.*]], %struct.St* noundef [[S:%.*]]) #[[ATTR0:[0-9]+]] { +// SIMD-ONLY4-NEXT: entry: +// SIMD-ONLY4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY4-NEXT: [[A_ADDR:%.*]] = alloca float*, align 8 +// SIMD-ONLY4-NEXT: [[S_ADDR:%.*]] = alloca %struct.St*, align 8 +// SIMD-ONLY4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// SIMD-ONLY4-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 +// SIMD-ONLY4-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 +// SIMD-ONLY4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// SIMD-ONLY4-NEXT: [[I:%.*]] = alloca i32, align 4 +// SIMD-ONLY4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY4-NEXT: store float* [[A]], float** [[A_ADDR]], align 8 +// SIMD-ONLY4-NEXT: store %struct.St* [[S]], %struct.St** [[S_ADDR]], align 8 +// SIMD-ONLY4-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY4-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +// SIMD-ONLY4-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 +// SIMD-ONLY4-NEXT: store i64 9, i64* [[DOTOMP_UB]], align 8 +// SIMD-ONLY4-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 +// SIMD-ONLY4-NEXT: [[CONV:%.*]] = trunc i64 [[TMP2]] to i32 +// SIMD-ONLY4-NEXT: store i32 [[CONV]], i32* [[DOTOMP_IV]], align 4 +// SIMD-ONLY4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// SIMD-ONLY4: omp.inner.for.cond: +// SIMD-ONLY4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2:![0-9]+]] +// SIMD-ONLY4-NEXT: [[CONV1:%.*]] = sext i32 [[TMP3]] to i64 +// SIMD-ONLY4-NEXT: [[TMP4:%.*]] = load i64, i64* [[DOTOMP_UB]], align 8, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY4-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV1]], [[TMP4]] +// SIMD-ONLY4-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// SIMD-ONLY4: omp.inner.for.body: +// SIMD-ONLY4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2]] +// 
SIMD-ONLY4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP5]], 1 +// SIMD-ONLY4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// SIMD-ONLY4-NEXT: store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// SIMD-ONLY4: omp.body.continue: +// SIMD-ONLY4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// SIMD-ONLY4: omp.inner.for.inc: +// SIMD-ONLY4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY4-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP6]], 1 +// SIMD-ONLY4-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2]] +// SIMD-ONLY4-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] +// SIMD-ONLY4: omp.inner.for.end: +// SIMD-ONLY4-NEXT: store i32 10, i32* [[I]], align 4 +// SIMD-ONLY4-NEXT: ret void +// diff --git a/clang/test/OpenMP/target_firstprivate_codegen.cpp b/clang/test/OpenMP/target_firstprivate_codegen.cpp index 4f8f5ae37472c..05a6e891204f2 100644 --- a/clang/test/OpenMP/target_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/target_firstprivate_codegen.cpp @@ -1,38 +1,37 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ // Test host codegen. -// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK0 // RUN: %clang_cc1 -no-opaque-pointers -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s -// RUN: %clang_cc1 -no-opaque-pointers -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 -// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -no-opaque-pointers -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK1 +// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK2 // RUN: %clang_cc1 -no-opaque-pointers -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s -// RUN: %clang_cc1 -no-opaque-pointers -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -no-opaque-pointers -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK3 // RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp-simd -x 
c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s // RUN: %clang_cc1 -no-opaque-pointers -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s -// RUN: %clang_cc1 -no-opaque-pointers -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -no-opaque-pointers -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY01 %s +// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY02 %s // RUN: %clang_cc1 -no-opaque-pointers -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s -// RUN: %clang_cc1 -no-opaque-pointers -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// SIMD-ONLY0-NOT: {{__kmpc|__tgt}} +// RUN: %clang_cc1 -no-opaque-pointers -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY03 %s // Test target codegen - host bc file has to be created first. 
// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64 +// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK // RUN: %clang_cc1 -no-opaque-pointers -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s -// RUN: %clang_cc1 -no-opaque-pointers -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64 +// RUN: %clang_cc1 -no-opaque-pointers -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK1 // RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32 +// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK2 // RUN: %clang_cc1 -no-opaque-pointers -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s -// RUN: %clang_cc1 -no-opaque-pointers -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32 +// RUN: %clang_cc1 -no-opaque-pointers -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK3 // RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc // RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s // RUN: %clang_cc1 -no-opaque-pointers -fopenmp-simd -x c++ -std=c++11 -triple 
powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s -// RUN: %clang_cc1 -no-opaque-pointers -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -no-opaque-pointers -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY11 %s // RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefix SIMD-ONLY12 %s // RUN: %clang_cc1 -no-opaque-pointers -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s -// RUN: %clang_cc1 -no-opaque-pointers -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s -// SIMD-ONLY1-NOT: {{__kmpc|__tgt}} +// RUN: %clang_cc1 -no-opaque-pointers -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY13 %s // expected-no-diagnostics #ifndef HEADER @@ -47,28 +46,9 @@ struct TT { int ga = 5; #pragma omp end declare target -// CHECK-DAG: [[TT:%.+]] = type { i64, i8 } -// CHECK-DAG: [[TTII:%.+]] = type { i32, i32 } -// CHECK-DAG: [[S1:%.+]] = type { double } - -// TCHECK-DAG: [[TT:%.+]] = type { i64, i8 } -// TCHECK-DAG: [[TTII:%.+]] = type { i32, i32 } -// TCHECK-DAG: [[S1:%.+]] = type { double } - -// CHECK-DAG: [[SIZET:@.+]] = private unnamed_addr constant [3 x i{{32|64}}] [i[[SZ:32|64]] 4, i{{64|32}} {{8|4}}, i[[SZ:32|64]] 4] -// CHECK-DAG: [[MAPT:@.+]] = private unnamed_addr constant [3 x i64] [i64 288, i64 49, i64 288] -// CHECK-DAG: [[SIZET2:@.+]] = private unnamed_addr constant [9 x i64] [i64 2, i64 40, i64 {{4|8}}, i64 0, i64 400, i64 {{4|8}}, i64 {{4|8}}, i64 0, i64 {{12|16}}] -// CHECK-DAG: [[MAPT2:@.+]] = private unnamed_addr constant [9 x i64] [i64 288, i64 161, i64 800, i64 161, i64 161, i64 800, i64 800, i64 161, i64 161] -// CHECK-DAG: [[SIZET3:@.+]] = private unnamed_addr constant [2 x i{{32|64}}] [i{{32|64}} 0, i{{32|64}} 8] -// CHECK-DAG: [[MAPT3:@.+]] = private unnamed_addr constant [2 x i64] [i64 32, i64 161] -// CHECK-DAG: [[SIZET4:@.+]] = private unnamed_addr constant [5 x i64] [i64 8, i64 4, i64 {{4|8}}, i64 {{4|8}}, i64 0] -// CHECK-DAG: [[MAPT4:@.+]] = private unnamed_addr 
constant [5 x i64] [i64 547, i64 288, i64 800, i64 800, i64 161] -// CHECK-DAG: [[SIZET5:@.+]] = private unnamed_addr constant [3 x i{{32|64}}] [i[[SZ]] 4, i[[SZ]] 1, i[[SZ]] 40] -// CHECK-DAG: [[MAPT5:@.+]] = private unnamed_addr constant [3 x i64] [i64 288, i64 288, i64 161] -// CHECK-DAG: [[SIZET6:@.+]] = private unnamed_addr constant [2 x i{{32|64}}] [i[[SZ]] 4, i[[SZ]] 40] -// CHECK-DAG: [[MAPT6:@.+]] = private unnamed_addr constant [2 x i64] [i64 288, i64 161] - -// CHECK: define {{.*}}[[FOO:@.+]]( + + + int foo(int n, double *ptr) { int a = 0; short aa = 0; @@ -85,75 +65,7 @@ int foo(int n, double *ptr) { } // a is passed by value to tgt_target - // CHECK: [[N_ADDR:%.+]] = alloca i{{[0-9]+}}, - // CHECK: [[PTR_ADDR:%.+]] = alloca double*, - // CHECK: [[A:%.+]] = alloca i{{[0-9]+}}, - // CHECK: [[A2:%.+]] = alloca i{{[0-9]+}}, - // CHECK: [[B:%.+]] = alloca [10 x float], - // CHECK: [[SSTACK:%.+]] = alloca i8*, - // CHECK: [[C:%.+]] = alloca [5 x [10 x double]], - // CHECK: [[D:%.+]] = alloca [[TT]], - // CHECK: [[FP_E:%.+]] = alloca [[TTII]], - // CHECK: [[P:%.+]] = alloca i32*, align 64 - // CHECK: [[ACAST:%.+]] = alloca i{{[0-9]+}}, - // CHECK: [[BASE_PTR_ARR:%.+]] = alloca [3 x i8*], - // CHECK: [[PTR_ARR:%.+]] = alloca [3 x i8*], - // CHECK: [[A2CAST:%.+]] = alloca i{{[0-9]+}}, - // CHECK: [[BASE_PTR_ARR2:%.+]] = alloca [9 x i8*], - // CHECK: [[PTR_ARR2:%.+]] = alloca [9 x i8*], - // CHECK: [[SIZET2:%.+]] = alloca [9 x i{{[0-9]+}}], - // CHECK: [[BASE_PTR_ARR3:%.+]] = alloca [2 x i8*], - // CHECK: [[PTR_ARR3:%.+]] = alloca [2 x i8*], - // CHECK: [[N_ADDR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[N_ADDR]], - // CHECK-64: [[N_EXT:%.+]] = zext i{{[0-9]+}} [[N_ADDR_VAL]] to i{{[0-9]+}} - // CHECK: [[SSAVE_RET:%.+]] = call i8* @llvm.stacksave() - // CHECK: store i8* [[SSAVE_RET]], i8** [[SSTACK]], - // CHECK-64: [[BN_VLA:%.+]] = alloca float, i{{[0-9]+}} [[N_EXT]], - // CHECK-32: [[BN_VLA:%.+]] = alloca float, i{{[0-9]+}} [[N_ADDR_VAL]], - // CHECK: [[N_ADDR_VAL2:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[N_ADDR]], - // CHECK-64: [[N_EXT2:%.+]] = zext i{{[0-9]+}} [[N_ADDR_VAL2]] to i{{[0-9]+}} - // CHECK-64: [[CN_SIZE:%.+]] = mul{{.+}} i{{[0-9]+}} 5, [[N_EXT2]] - // CHECK-32: [[CN_SIZE:%.+]] = mul{{.+}} i{{[0-9]+}} 5, [[N_ADDR_VAL2]] - // CHECK: [[CN_VLA:%.+]] = alloca double, i{{[0-9]+}} [[CN_SIZE]], - // CHECK: [[AVAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A]], - // CHECK-64: [[CONV:%.+]] = bitcast i{{[0-9]+}}* [[ACAST]] to i{{[0-9]+}}* - // CHECK-64: store i{{[0-9]+}} [[AVAL]], i{{[0-9]+}}* [[CONV]], - // CHECK-32: store i{{[0-9]+}} [[AVAL]], i{{[0-9]+}}* [[ACAST]], - // CHECK: [[ACAST_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[ACAST]], - // CHECK: [[P_PTR:%.+]] = load i32*, i32** [[P]], align 64 - // CHECK: [[BASE_PTR_GEP:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BASE_PTR_ARR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 - // CHECK: [[ACAST_TOPTR:%.+]] = bitcast i8** [[BASE_PTR_GEP]] to i{{[0-9]+}}* - // CHECK: store i{{[0-9]+}} [[ACAST_VAL]], i{{[0-9]+}}* [[ACAST_TOPTR]], - // CHECK: [[PTR_GEP:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[PTR_ARR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 - // CHECK: [[ACAST_TOPTR2:%.+]] = bitcast i8** [[PTR_GEP]] to i{{[0-9]+}}* - // CHECK: store i{{[0-9]+}} [[ACAST_VAL]], i{{[0-9]+}}* [[ACAST_TOPTR2]], - // CHECK: [[BASE_PTR_GEP:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BASE_PTR_ARR]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 - // CHECK: [[PCAST_TOPTR:%.+]] = bitcast i8** [[BASE_PTR_GEP]] to i32*** - // CHECK: store i32** [[P]], i32*** 
[[PCAST_TOPTR]], - // CHECK: [[PTR_GEP:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[PTR_ARR]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 - // CHECK: [[PCAST_TOPTR2:%.+]] = bitcast i8** [[PTR_GEP]] to i32** - // CHECK: store i32* [[P_PTR]], i32** [[PCAST_TOPTR2]], - // CHECK: [[BASE_PTR_GEP:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BASE_PTR_ARR]], i{{[0-9]+}} 0, i{{[0-9]+}} 2 - // CHECK: [[PCAST_TOPTR:%.+]] = bitcast i8** [[BASE_PTR_GEP]] to i{{64|32}}* - // CHECK: store i{{64|32}} [[GA_VAL:%.*]], i{{64|32}}* [[PCAST_TOPTR]], - // CHECK: [[PTR_GEP:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[PTR_ARR]], i{{[0-9]+}} 0, i{{[0-9]+}} 2 - // CHECK: [[PCAST_TOPTR2:%.+]] = bitcast i8** [[PTR_GEP]] to i{{64|32}}* - // CHECK: store i{{64|32}} [[GA_VAL]], i{{64|32}}* [[PCAST_TOPTR2]], - // CHECK: [[BASE_PTR_GEP_ARG:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BASE_PTR_ARR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 - // CHECK: [[PTR_GEP_ARG:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[PTR_ARR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 - // CHECK: {{.+}} = call i32 @__tgt_target_kernel(%struct.ident_t* @{{.+}}, i64 -1, i32 -1, i32 0, i8* @.{{.+}}.region_id, %struct.__tgt_kernel_arguments* [[ARGS:%.+]]) - - // TCHECK: define weak_odr protected void @__omp_offloading_{{.+}}(i{{[0-9]+}} noundef [[A_IN:%.+]], i32** noundef nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) [[P_IN:%.+]], i{{[0-9]+}} noundef [[GA_IN:%.+]]) - // TCHECK: [[A_ADDR:%.+]] = alloca i{{[0-9]+}}, - // TCHECK: [[P_ADDR:%.+]] = alloca i32**, - // TCHECK: [[GA_ADDR:%.+]] = alloca i{{64|32}}, - // TCHECK: [[P_PRIV:%.+]] = alloca i32*, - // TCHECK-NOT: alloca i{{[0-9]+}} - // TCHECK: store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[A_ADDR]], - // TCHECK: store i32** [[P_IN]], i32*** [[P_ADDR]], - // TCHECK: store i{{[0-9]+}} [[GA_IN]], i{{[0-9]+}}* [[GA_ADDR]], - // TCHECK-NOT: store i{{[0-9]+}} % - // TCHECK: ret void + #pragma omp target firstprivate(aa, b, bn, c, cn, d) { @@ -166,198 +78,44 @@ int foo(int n, double *ptr) { d.Y = 1; } - // CHECK: [[A2VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A2]], - // CHECK: [[A2CASTCONV:%.+]] = bitcast i{{[0-9]+}}* [[A2CAST]] to i{{[0-9]+}}* - // CHECK: store i{{[0-9]+}} [[A2VAL]], i{{[0-9]+}}* [[A2CASTCONV]], - // CHECK: [[A2CAST_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A2CAST]], - // CHECK-64: [[BN_SIZE:%.+]] = mul{{.+}} i{{[0-9]+}} [[N_EXT]], 4 - // CHECK-32: [[BN_SZ_SIZE:%.+]] = mul{{.+}} i{{[0-9]+}} [[N_ADDR_VAL]], 4 - // CHECK-32: [[BN_SIZE:%.+]] = sext i32 [[BN_SZ_SIZE]] to i64 - // CHECK-64: [[CN_SIZE_1:%.+]] = mul{{.+}} i{{[0-9]+}} 5, [[N_EXT2]] - // CHECK-32: [[CN_SIZE_1:%.+]] = mul{{.+}} i{{[0-9]+}} 5, [[N_ADDR_VAL2]] - // CHECK-64: [[CN_SIZE_2:%.+]] = mul{{.+}} i{{[0-9]+}} [[CN_SIZE_1]], 8 - // CHECK-32: [[CN_SZ_SIZE_2:%.+]] = mul{{.+}} i{{[0-9]+}} [[CN_SIZE_1]], 8 - // CHECK-32: [[CN_SIZE_2:%.+]] = sext i32 [[CN_SZ_SIZE_2]] to i64 // firstprivate(aa) --> base_ptr = aa, ptr = aa, size = 2 (short) - // CHECK: [[BASE_PTR_GEP2_0:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 - // CHECK: [[ACAST_TOPTR:%.+]] = bitcast i8** [[BASE_PTR_GEP2_0]] to i{{[0-9]+}}* - // CHECK: store i{{[0-9]+}} [[A2CAST_VAL]], i{{[0-9]+}}* [[ACAST_TOPTR]], - // CHECK: [[PTR_GEP2_0:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 - // CHECK: [[ACAST_TOPTR:%.+]] = bitcast i8** [[PTR_GEP2_0]] to i{{[0-9]+}}* - // CHECK: store i{{[0-9]+}} [[A2CAST_VAL]], i{{[0-9]+}}* [[ACAST_TOPTR]], // 
firstprivate(b): base_ptr = &b[0], ptr = &b[0], size = 40 (sizeof(float)*10) - // CHECK: [[BASE_PTR_GEP2_1:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[BASE_PTR_GEP2_1]] to [10 x float]** - // CHECK: store [10 x float]* [[B]], [10 x float]** [[BCAST_TOPTR]], - // CHECK: [[PTR_GEP2_1:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[PTR_GEP2_1]] to [10 x float]** - // CHECK: store [10 x float]* [[B]], [10 x float]** [[BCAST_TOPTR]], // firstprivate(bn), 2 entries, n and bn: (1) base_ptr = n, ptr = n, size = 8 ; (2) base_ptr = &c[0], ptr = &c[0], size = n*sizeof(float) - // CHECK: [[BASE_PTR_GEP2_2:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 2 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[BASE_PTR_GEP2_2]] to i{{[0-9]+}}* - // CHECK-64: store i{{[0-9]+}} [[N_EXT]], i{{[0-9]+}}* [[BCAST_TOPTR]], - // CHECK-32: store i{{[0-9]+}} [[N_ADDR_VAL]], i{{[0-9]+}}* [[BCAST_TOPTR]], - // CHECK: [[PTR_GEP2_2:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 2 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[PTR_GEP2_2]] to i{{[0-9]+}}* - // CHECK-64: store i{{[0-9]+}} [[N_EXT]], i{{[0-9]+}}* [[BCAST_TOPTR]], - // CHECK-32: store i{{[0-9]+}} [[N_ADDR_VAL]], i{{[0-9]+}}* [[BCAST_TOPTR]], - // CHECK: [[BASE_PTR_GEP2_3:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 3 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[BASE_PTR_GEP2_3]] to float** - // CHECK: store float* [[BN_VLA]], float** [[BCAST_TOPTR]], - // CHECK: [[SIZE_GEPBN_3:%.+]] = getelementptr inbounds [9 x i{{[0-9]+}}], [9 x i{{[0-9]+}}]* [[SIZET2]], i{{[0-9]+}} 0, i{{[0-9]+}} 3 - // CHECK: store i{{[0-9]+}} [[BN_SIZE]], i{{[0-9]+}}* [[SIZE_GEPBN_3]] // firstprivate(c): base_ptr = &c[0], ptr = &c[0], size = 400 (5*10*sizeof(double)) - // CHECK: [[BASE_PTR_GEP2_4:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 4 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[BASE_PTR_GEP2_4]] to [5 x [10 x double]]** - // CHECK: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[BCAST_TOPTR]], - // CHECK: [[PTR_GEP2_4:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 4 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[PTR_GEP2_4]] to [5 x [10 x double]]** - // CHECK: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[BCAST_TOPTR]], // firstprivate(cn), 3 entries, 5, n, cn: (1) base_ptr = 5, ptr = 5, size = 8; (2) (1) base_ptr = n, ptr = n, size = 8; (3) base_ptr = &cn[0], ptr = &cn[0], size = 5*n*sizeof(double) - // CHECK: [[BASE_PTR_GEP2_5:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 5 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[BASE_PTR_GEP2_5]] to i{{[0-9]+}}* - // CHECK: store i{{[0-9]+}} 5, i{{[0-9]+}}* [[BCAST_TOPTR]], - // CHECK: [[PTR_GEP2_5:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 5 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[PTR_GEP2_5]] to i{{[0-9]+}}* - // CHECK: store i{{[0-9]+}} 5, i{{[0-9]+}}* [[BCAST_TOPTR]], - // CHECK: [[BASE_PTR_GEP2_6:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 6 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast 
i8** [[BASE_PTR_GEP2_6]] to i{{[0-9]+}}* - // CHECK-64: store i{{[0-9]+}} [[N_EXT2]], i{{[0-9]+}}* [[BCAST_TOPTR]], - // CHECK-32: store i{{[0-9]+}} [[N_ADDR_VAL2]], i{{[0-9]+}}* [[BCAST_TOPTR]], - // CHECK: [[PTR_GEP2_6:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 6 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[PTR_GEP2_6]] to i{{[0-9]+}}* - // CHECK-64: store i{{[0-9]+}} [[N_EXT2]], i{{[0-9]+}}* [[BCAST_TOPTR]], - // CHECK-32: store i{{[0-9]+}} [[N_ADDR_VAL2]], i{{[0-9]+}}* [[BCAST_TOPTR]], - // CHECK: [[BASE_PTR_GEP2_7:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 7 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[BASE_PTR_GEP2_7]] to double** - // CHECK: store double* [[CN_VLA]], double** [[BCAST_TOPTR]], - // CHECK: [[PTR_GEP2_7:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 7 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[PTR_GEP2_7]] to double** - // CHECK: store double* [[CN_VLA]], double** [[BCAST_TOPTR]], - // CHECK: [[SIZE_GEPCN_7:%.+]] = getelementptr inbounds [9 x i{{[0-9]+}}], [9 x i{{[0-9]+}}]* [[SIZET2]], i{{[0-9]+}} 0, i{{[0-9]+}} 7 - // CHECK: store i{{[0-9]+}} [[CN_SIZE_2]], i{{[0-9]+}}* [[SIZE_GEPCN_7]], // firstprivate(d): base_ptr = &d, ptr = &d, size = 16 - // CHECK: [[BASE_PTR_GEP2_8:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 8 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[BASE_PTR_GEP2_8]] to [[TT]]** - // CHECK: store [[TT]]* [[D]], [[TT]]** [[BCAST_TOPTR]], - // CHECK: [[PTR_GEP2_8:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 8 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[PTR_GEP2_8]] to [[TT]]** - // CHECK: store [[TT]]* [[D]], [[TT]]** [[BCAST_TOPTR]], - - // CHECK: [[BASE_PTR_GEP_ARG2:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 - // CHECK: [[PTR_GEP_ARG2:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 - // CHECK: [[SIZES_ARG2:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[SIZET2]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 - // CHECK: {{.+}} = call i32 @__tgt_target_kernel(%struct.ident_t* @{{.+}}, i64 -1, i32 -1, i32 0, i8* @.{{.+}}.region_id, %struct.__tgt_kernel_arguments* [[ARGS:%.+]]) + // make sure that firstprivate variables are generated in all cases and that we use those instances for operations inside the // target region - // TCHECK: define {{.*}}void @__omp_offloading_{{.+}}(i{{[0-9]+}} noundef [[A2_IN:%.+]], [10 x float]* {{.+}} [[B_IN:%.+]], i{{[0-9]+}} noundef [[BN_SZ:%.+]], float* {{.+}} [[BN_IN:%.+]], [5 x [10 x double]]* {{.+}} [[C_IN:%.+]], i{{[0-9]+}} noundef [[CN_SZ1:%.+]], i{{[0-9]+}} noundef [[CN_SZ2:%.+]], double* {{.+}} [[CN_IN:%.+]], [[TT]]* {{.+}} [[D_IN:%.+]]) - // TCHECK: [[A2_ADDR:%.+]] = alloca i{{[0-9]+}}, - // TCHECK: [[B_ADDR:%.+]] = alloca [10 x float]*, - // TCHECK: [[VLA_ADDR:%.+]] = alloca i{{[0-9]+}}, - // TCHECK: [[BN_ADDR:%.+]] = alloca float*, - // TCHECK: [[C_ADDR:%.+]] = alloca [5 x [10 x double]]*, - // TCHECK: [[VLA_ADDR2:%.+]] = alloca i{{[0-9]+}}, - // TCHECK: [[VLA_ADDR4:%.+]] = alloca i{{[0-9]+}}, - // TCHECK: [[CN_ADDR:%.+]] = alloca double*, - // TCHECK: [[D_ADDR:%.+]] = alloca [[TT]]*, - // TCHECK-NOT: alloca i{{[0-9]+}}, - // TCHECK: [[B_PRIV:%.+]] = alloca [10 x float], - // TCHECK: [[SSTACK:%.+]] = alloca i8*, - // TCHECK: [[C_PRIV:%.+]] 
= alloca [5 x [10 x double]], - // TCHECK: [[D_PRIV:%.+]] = alloca [[TT]], - // TCHECK: store i{{[0-9]+}} [[A2_IN]], i{{[0-9]+}}* [[A2_ADDR]], - // TCHECK: store [10 x float]* [[B_IN]], [10 x float]** [[B_ADDR]], - // TCHECK: store i{{[0-9]+}} [[BN_SZ]], i{{[0-9]+}}* [[VLA_ADDR]], - // TCHECK: store float* [[BN_IN]], float** [[BN_ADDR]], - // TCHECK: store [5 x [10 x double]]* [[C_IN]], [5 x [10 x double]]** [[C_ADDR]], - // TCHECK: store i{{[0-9]+}} [[CN_SZ1]], i{{[0-9]+}}* [[VLA_ADDR2]], - // TCHECK: store i{{[0-9]+}} [[CN_SZ2]], i{{[0-9]+}}* [[VLA_ADDR4]], - // TCHECK: store double* [[CN_IN]], double** [[CN_ADDR]], - // TCHECK: store [[TT]]* [[D_IN]], [[TT]]** [[D_ADDR]], - // TCHECK: [[CONV_A2ADDR:%.+]] = bitcast i{{[0-9]+}}* [[A2_ADDR]] to i{{[0-9]+}}* - // TCHECK: [[B_ADDR_REF:%.+]] = load [10 x float]*, [10 x float]** [[B_ADDR]], - // TCHECK: [[BN_SZ_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[VLA_ADDR]], - // TCHECK: [[BN_ADDR_REF:%.+]] = load float*, float** [[BN_ADDR]], - // TCHECK: [[C_ADDR_REF:%.+]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]], - // TCHECK: [[CN_SZ1_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[VLA_ADDR2]], - // TCHECK: [[CN_SZ2_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[VLA_ADDR4]], - // TCHECK: [[CN_ADDR_REF:%.+]] = load double*, double** [[CN_ADDR]], - // TCHECK: [[D_ADDR_REF:%.+]] = load [[TT]]*, [[TT]]** [[D_ADDR]], // firstprivate(aa): a_priv = a_in - // TCHECK-NOT: store i{{[0-9]+}} % // firstprivate(b): memcpy(b_priv,b_in) - // TCHECK: [[B_PRIV_BCAST:%.+]] = bitcast [10 x float]* [[B_PRIV]] to i8* - // TCHECK: [[B_ADDR_REF_BCAST:%.+]] = bitcast [10 x float]* [[B_ADDR_REF]] to i8* - // TCHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[B_PRIV_BCAST]], i8* align {{[0-9]+}} [[B_ADDR_REF_BCAST]], {{.+}}) - // TCHECK: [[RET_STACK:%.+]] = call i8* @llvm.stacksave() - // TCHECK: store i8* [[RET_STACK]], i8** [[SSTACK]], // firstprivate(bn) - // TCHECK: [[BN_PRIV:%.+]] = alloca float, i{{[0-9]+}} [[BN_SZ_VAL]], - // TCHECK: [[BN_COPY_SZ:%.+]] = mul{{.+}} i{{[0-9]+}} [[BN_SZ_VAL]], 4 - // TCHECK: [[BN_PRIV__BCAST:%.+]] = bitcast float* [[BN_PRIV]] to i8* - // TCHECK: [[BN_REF_IN_BCAST:%.+]] = bitcast float* [[BN_ADDR_REF]] to i8* - // TCHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BN_PRIV__BCAST]], i8* align {{[0-9]+}} [[BN_REF_IN_BCAST]], i{{[0-9]+}} [[BN_COPY_SZ]],{{.+}}) // firstprivate(c) - // TCHECK: [[C_PRIV_BCAST:%.+]] = bitcast [5 x [10 x double]]* [[C_PRIV]] to i8* - // TCHECK: [[C_IN_BCAST:%.+]] = bitcast [5 x [10 x double]]* [[C_ADDR_REF]] to i8* - // TCHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[C_PRIV_BCAST]], i8* align {{[0-9]+}} [[C_IN_BCAST]],{{.+}}) // firstprivate(cn) - // TCHECK: [[CN_SZ:%.+]] = mul{{.+}} i{{[0-9]+}} [[CN_SZ1_VAL]], [[CN_SZ2_VAL]] - // TCHECK: [[CN_PRIV:%.+]] = alloca double, i{{[0-9]+}} [[CN_SZ]], - // TCHECK: [[CN_SZ2:%.+]] = mul{{.+}} i{{[0-9]+}} [[CN_SZ1_VAL]], [[CN_SZ2_VAL]] - // TCHECK: [[CN_SZ2_CPY:%.+]] = mul{{.+}} i{{[0-9]+}} [[CN_SZ2]], 8 - // TCHECK: [[CN_PRIV_BCAST:%.+]] = bitcast double* [[CN_PRIV]] to i8* - // TCHECK: [[CN_IN_BCAST:%.+]] = bitcast double* [[CN_ADDR_REF]] to i8* - // TCHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[CN_PRIV_BCAST]], i8* align {{[0-9]+}} [[CN_IN_BCAST]], i{{[0-9]+}} [[CN_SZ2_CPY]],{{.+}}) // firstprivate(d) - // TCHECK: [[D_PRIV_BCAST:%.+]] = bitcast [[TT]]* [[D_PRIV]] to i8* - // TCHECK: [[D_IN_BCAST:%.+]] = bitcast [[TT]]* [[D_ADDR_REF]] to i8* - // TCHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} 
[[D_PRIV_BCAST]], i8* align {{[0-9]+}} [[D_IN_BCAST]],{{.+}}) #pragma omp target firstprivate(ptr, e) { ptr[0] = e.X; ptr[0]++; } - // CHECK: [[PTR_ADDR_REF:%.+]] = load double*, double** [[PTR_ADDR]], - - // CHECK: [[BASE_PTR_GEP3_0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BASE_PTR_ARR3]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[BASE_PTR_GEP3_0]] to double** - // CHECK: store double* [[PTR_ADDR_REF]], double** [[BCAST_TOPTR]], - // CHECK: [[PTR_GEP3_0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[PTR_ARR3]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[PTR_GEP3_0]] to double** - // CHECK: store double* [[PTR_ADDR_REF]], double** [[BCAST_TOPTR]], - // CHECK: [[BASE_PTR_GEP3_1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BASE_PTR_ARR3]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[BASE_PTR_GEP3_1]] to [[TTII]]** - // CHECK: store [[TTII]]* [[FP_E]], [[TTII]]** [[BCAST_TOPTR]], - // CHECK: [[PTR_GEP3_1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[PTR_ARR3]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[PTR_GEP3_1]] to [[TTII]]** - // CHECK: store [[TTII]]* [[FP_E]], [[TTII]]** [[BCAST_TOPTR]], - - // CHECK: [[BASE_PTR_GEP_ARG3:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BASE_PTR_ARR3]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 - // CHECK: [[PTR_GEP_ARG3:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[PTR_ARR3]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 - // CHECK: {{.+}} = call i32 @__tgt_target_kernel(%struct.ident_t* @{{.+}}, i64 -1, i32 -1, i32 0, i8* @.{{.+}}.region_id, %struct.__tgt_kernel_arguments* [[ARGS:%.+]]) - - // TCHECK: define weak_odr protected void @__omp_offloading_{{.+}}(double* noundef [[PTR_IN:%.+]], [[TTII]]* noundef nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) [[E:%.+]]) - // TCHECK: [[PTR_ADDR:%.+]] = alloca double*, - // TCHECK-NOT: alloca [[TTII]], - // TCHECK-NOT: alloca double*, - // TCHECK: store double* [[PTR_IN]], double** [[PTR_ADDR]], - // TCHECK-NOT: store double* % + + + return a; } @@ -391,30 +149,13 @@ static int fstatic(int n) { return a; } -// TCHECK: define weak_odr protected void @__omp_offloading_{{.+}}(i{{[0-9]+}} noundef [[A_IN:%.+]], i{{[0-9]+}} noundef [[A3_IN:%.+]], [10 x i{{[0-9]+}}]*{{.+}} [[B_IN:%.+]]) -// TCHECK: [[A_ADDR:%.+]] = alloca i{{[0-9]+}}, -// TCHECK: [[A3_ADDR:%.+]] = alloca i{{[0-9]+}}, -// TCHECK: [[B_ADDR:%.+]] = alloca [10 x i{{[0-9]+}}]*, -// TCHECK-NOT: alloca i{{[0-9]+}}, -// TCHECK: [[B_PRIV:%.+]] = alloca [10 x i{{[0-9]+}}], -// TCHECK: store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[A_ADDR]], -// TCHECK: store i{{[0-9]+}} [[A3_IN]], i{{[0-9]+}}* [[A3_ADDR]], -// TCHECK: store [10 x i{{[0-9]+}}]* [[B_IN]], [10 x i{{[0-9]+}}]** [[B_ADDR]], -// TCHECK-64: [[A_CONV:%.+]] = bitcast i{{[0-9]+}}* [[A_ADDR]] to i{{[0-9]+}}* -// TCHECK: [[A3_CONV:%.+]] = bitcast i{{[0-9]+}}* [[A3_ADDR]] to i8* -// TCHECK: [[B_ADDR_REF:%.+]] = load [10 x i{{[0-9]+}}]*, [10 x i{{[0-9]+}}]** [[B_ADDR]], // firstprivate(a): a_priv = a_in // firstprivate(aaa) -// TCHECK-NOT: store i{{[0-9]+}} % // firstprivate(b) -// TCHECK: [[B_PRIV_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_PRIV]] to i8* -// TCHECK: [[B_IN_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_ADDR_REF]] to i8* -// TCHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[B_PRIV_BCAST]], i8* align {{[0-9]+}} [[B_IN_BCAST]],{{.+}}) -// TCHECK: ret void struct S1 { double a; @@ -433,123 +174,33 @@ struct S1 { } // 
on the host side, we first generate r1, then the static function and the template above - // CHECK: define{{.+}} i32 {{.+}}([[S1]]* {{.+}}, i{{[0-9]+}} {{.+}}) - // CHECK: [[BASE_PTRS4:%.+]] = alloca [5 x i8*], - // CHECK: [[PTRS4:%.+]] = alloca [5 x i8*], - // CHECK: [[SIZET4:%.+]] = alloca [5 x i{{[0-9]+}}], // map(this: this ptr is implicitly captured (not firstprivate matter) - // CHECK: [[BP0:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BASE_PTRS4]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 - // CHECK: [[CBP0:%.+]] = bitcast i8** [[BP0]] to %struct.S1** - // CHECK: store %struct.S1* [[THIS:%.+]], %struct.S1** [[CBP0]], - // CHECK: [[P0:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[PTRS4]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 - // CHECK: [[CP0:%.+]] = bitcast i8** [[P0]] to double** - // CHECK: store double* [[A:%.+]], double** [[CP0]], // firstprivate(b): base_ptr = b, ptr = b, size = 4 (pass by-value) - // CHECK: [[BASE_PTRS_GEP4_1:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BASE_PTRS4]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[BASE_PTRS_GEP4_1]] to i{{[0-9]+}}* - // CHECK: store i{{[0-9]+}} [[B_CAST:%.+]], i{{[0-9]+}}* [[BCAST_TOPTR]], - // CHECK: [[PTRS_GEP4_1:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[PTRS4]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[PTRS_GEP4_1]] to i{{[0-9]+}}* - // CHECK: store i{{[0-9]+}} [[B_CAST]], i{{[0-9]+}}* [[BCAST_TOPTR]], // firstprivate(c), 3 entries: 2, n, c - // CHECK: [[BASE_PTRS_GEP4_2:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BASE_PTRS4]], i{{[0-9]+}} 0, i{{[0-9]+}} 2 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[BASE_PTRS_GEP4_2]] to i{{[0-9]+}}* - // CHECK: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[BCAST_TOPTR]], - // CHECK: [[PTRS_GEP4_2:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[PTRS4]], i{{[0-9]+}} 0, i{{[0-9]+}} 2 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[PTRS_GEP4_2]] to i{{[0-9]+}}* - // CHECK: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[BCAST_TOPTR]], - // CHECK: [[BASE_PTRS_GEP4_3:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BASE_PTRS4]], i{{[0-9]+}} 0, i{{[0-9]+}} 3 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[BASE_PTRS_GEP4_3]] to i{{[0-9]+}}* - // CHECK: store i{{[0-9]+}} [[N:%.+]], i{{[0-9]+}}* [[BCAST_TOPTR]], - // CHECK: [[PTRS_GEP4_3:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[PTRS4]], i{{[0-9]+}} 0, i{{[0-9]+}} 3 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[PTRS_GEP4_3]] to i{{[0-9]+}}* - // CHECK: store i{{[0-9]+}} [[N]], i{{[0-9]+}}* [[BCAST_TOPTR]], - // CHECK: [[BASE_PTRS_GEP4_4:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BASE_PTRS4]], i{{[0-9]+}} 0, i{{[0-9]+}} 4 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[BASE_PTRS_GEP4_4]] to i{{[0-9]+}}** - // CHECK: store i{{[0-9]+}}* [[B:%.+]], i{{[0-9]+}}** [[BCAST_TOPTR]], - // CHECK: [[PTRS_GEP4_4:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[PTRS4]], i{{[0-9]+}} 0, i{{[0-9]+}} 4 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[PTRS_GEP4_4]] to i{{[0-9]+}}** - // CHECK: store i{{[0-9]+}}* [[B]], i{{[0-9]+}}** [[BCAST_TOPTR]], - // CHECK: [[SIZES_GEP4_4:%.+]] = getelementptr inbounds [5 x i{{[0-9]+}}], [5 x i{{[0-9]+}}]* [[SIZET4]], i{{[0-9]+}} 0, i{{[0-9]+}} 4 - // CHECK: store i{{[0-9]+}} [[B_SIZE:%.+]], i{{[0-9]+}}* [[SIZES_GEP4_4]], // only check that we use the map types stored in the global variable - // CHECK: {{.+}} = call i32 @__tgt_target_kernel(%struct.ident_t* @{{.+}}, i64 -1, i32 -1, i32 0, i8* 
@.{{.+}}.region_id, %struct.__tgt_kernel_arguments* [[ARGS:%.+]]) - - // TCHECK: define weak_odr protected void @__omp_offloading_{{.+}}([[S1]]* noundef [[TH:%.+]], i{{[0-9]+}} noundef [[B_IN:%.+]], i{{[0-9]+}} noundef [[VLA:%.+]], i{{[0-9]+}} noundef [[VLA1:%.+]], i{{[0-9]+}}{{.+}} [[C_IN:%.+]]) - // TCHECK: [[TH_ADDR:%.+]] = alloca [[S1]]*, - // TCHECK: [[B_ADDR:%.+]] = alloca i{{[0-9]+}}, - // TCHECK: [[VLA_ADDR:%.+]] = alloca i{{[0-9]+}}, - // TCHECK: [[VLA_ADDR2:%.+]] = alloca i{{[0-9]+}}, - // TCHECK: [[C_ADDR:%.+]] = alloca i{{[0-9]+}}*, - // TCHECK-NOT: alloca i{{[0-9]+}}, - // TCHECK: [[SSTACK:%.+]] = alloca i8*, - - // TCHECK: store [[S1]]* [[TH]], [[S1]]** [[TH_ADDR]], - // TCHECK: store i{{[0-9]+}} [[B_IN]], i{{[0-9]+}}* [[B_ADDR]], - // TCHECK: store i{{[0-9]+}} [[VLA]], i{{[0-9]+}}* [[VLA_ADDR]], - // TCHECK: store i{{[0-9]+}} [[VLA1]], i{{[0-9]+}}* [[VLA_ADDR2]], - // TCHECK: store i{{[0-9]+}}* [[C_IN]], i{{[0-9]+}}** [[C_ADDR]], - // TCHECK: [[TH_ADDR_REF:%.+]] = load [[S1]]*, [[S1]]** [[TH_ADDR]], - // TCHECK-64: [[B_ADDR_CONV:%.+]] = bitcast i{{[0-9]+}}* [[B_ADDR]] to i{{[0-9]+}}* - // TCHECK: [[VLA_ADDR_REF:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[VLA_ADDR]], - // TCHECK: [[VLA_ADDR_REF2:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[VLA_ADDR2]], - // TCHECK: [[C_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[C_ADDR]], + + // firstprivate(b) - // TCHECK-NOT: store i{{[0-9]+}} % - // TCHECK: [[RET_STACK:%.+]] = call i8* @llvm.stacksave() - // TCHECK: store i8* [[RET_STACK:%.+]], i8** [[SSTACK]], // firstprivate(c) - // TCHECK: [[C_SZ:%.+]] = mul{{.+}} i{{[0-9]+}} [[VLA_ADDR_REF]], [[VLA_ADDR_REF2]] - // TCHECK: [[C_PRIV:%.+]] = alloca i{{[0-9]+}}, i{{[0-9]+}} [[C_SZ]], - // TCHECK: [[C_SZ2:%.+]] = mul{{.+}} i{{[0-9]+}} [[VLA_ADDR_REF]], [[VLA_ADDR_REF2]] - // TCHECK: [[C_SZ_CPY:%.+]] = mul{{.+}} i{{[0-9]+}} [[C_SZ2]], 2 - // TCHECK: [[C_PRIV_BCAST:%.+]] = bitcast i{{[0-9]+}}* [[C_PRIV]] to i8* - // TCHECK: [[C_IN_BCAST:%.+]] = bitcast i{{[0-9]+}}* [[C_ADDR_REF]] to i8* - // TCHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[C_PRIV_BCAST]], i8* align {{[0-9]+}} [[C_IN_BCAST]],{{.+}}) // finish - // TCHECK: [[RELOAD_SSTACK:%.+]] = load i8*, i8** [[SSTACK]], - // TCHECK: call void @llvm.stackrestore(i8* [[RELOAD_SSTACK]]) - // TCHECK: ret void // static host function - // CHECK: define{{.+}} i32 {{.+}}(i{{[0-9]+}} {{.+}}) - // CHECK: [[BASE_PTRS5:%.+]] = alloca [3 x i8*], - // CHECK: [[PTRS5:%.+]] = alloca [3 x i8*], // firstprivate(a): by value - // CHECK: [[BASE_PTRS_GEP5_0:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BASE_PTRS5]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[BASE_PTRS_GEP5_0]] to i{{[0-9]+}}* - // CHECK: store i{{[0-9]+}} [[A_CAST:%.+]], i{{[0-9]+}}* [[BCAST_TOPTR]], - // CHECK: [[PTRS_GEP5_0:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[PTRS5]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[PTRS_GEP5_0]] to i{{[0-9]+}}* - // CHECK: store i{{[0-9]+}} [[A_CAST]], i{{[0-9]+}}* [[BCAST_TOPTR]], // firstprivate(aaa): by value - // CHECK: [[BASE_PTRS_GEP5_1:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BASE_PTRS5]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[BASE_PTRS_GEP5_1]] to i{{[0-9]+}}* - // CHECK: store i{{[0-9]+}} [[A3_CAST:%.+]], i{{[0-9]+}}* [[BCAST_TOPTR]], - // CHECK: [[PTRS_GEP5_1:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[PTRS5]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast 
i8** [[PTRS_GEP5_1]] to i{{[0-9]+}}* - // CHECK: store i{{[0-9]+}} [[A3_CAST]], i{{[0-9]+}}* [[BCAST_TOPTR]], // firstprivate(b): base_ptr = &b[0], ptr= &b[0] - // CHECK: [[BASE_PTRS_GEP5_2:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BASE_PTRS5]], i{{[0-9]+}} 0, i{{[0-9]+}} 2 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[BASE_PTRS_GEP5_2]] to [10 x i{{[0-9]+}}]** - // CHECK: store [10 x i{{[0-9]+}}]* [[B:%.+]], [10 x i{{[0-9]+}}]** [[BCAST_TOPTR]], - // CHECK: [[PTRS_GEP5_2:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[PTRS5]], i{{[0-9]+}} 0, i{{[0-9]+}} 2 - // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[PTRS_GEP5_2]] to [10 x i{{[0-9]+}}]** - // CHECK: store [10 x i{{[0-9]+}}]* [[B]], [10 x i{{[0-9]+}}]** [[BCAST_TOPTR]], // only check that the right sizes and map types are used - // CHECK: {{.+}} = call i32 @__tgt_target_kernel(%struct.ident_t* @{{.+}}, i64 -1, i32 -1, i32 0, i8* @.{{.+}}.region_id, %struct.__tgt_kernel_arguments* [[ARGS:%.+]]) }; int bar(int n, double *ptr) { @@ -565,46 +216,11786 @@ int bar(int n, double *ptr) { // template host and device -// CHECK: define{{.+}} i32 {{.+}}(i{{[0-9]+}} {{.+}}) -// CHECK: [[BASE_PTRS6:%.+]] = alloca [2 x i8*], -// CHECK: [[PTRS6:%.+]] = alloca [2 x i8*], // firstprivate(a): by value -// CHECK: [[BASE_PTRS_GEP6_0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BASE_PTRS6]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[BASE_PTRS_GEP6_0]] to i{{[0-9]+}}* -// CHECK: store i{{[0-9]+}} [[AT_CAST:%.+]], i{{[0-9]+}}* [[BCAST_TOPTR]], -// CHECK: [[PTRS_GEP6_0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[PTRS6]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[PTRS_GEP6_0]] to i{{[0-9]+}}* -// CHECK: store i{{[0-9]+}} [[AT_CAST]], i{{[0-9]+}}* [[BCAST_TOPTR]], // firstprivate(b): pointer -// CHECK: [[BASE_PTRS_GEP6_1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BASE_PTRS6]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 -// CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[BASE_PTRS_GEP6_1]] to [10 x i{{[0-9]+}}]** -// CHECK: store [10 x i{{[0-9]+}}]* [[B:%.+]], [10 x i{{[0-9]+}}]** [[BCAST_TOPTR]], -// CHECK: [[PTRS_GEP6_1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[PTRS6]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 -// CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[PTRS_GEP6_1]] to [10 x i{{[0-9]+}}]** -// CHECK: store [10 x i{{[0-9]+}}]* [[B]], [10 x i{{[0-9]+}}]** [[BCAST_TOPTR]], - -// CHECK: {{.+}} = call i32 @__tgt_target_kernel(%struct.ident_t* @{{.+}}, i64 -1, i32 -1, i32 0, i8* @.{{.+}}.region_id, %struct.__tgt_kernel_arguments* [[ARGS:%.+]]) - -// TCHECK: define weak_odr protected void @__omp_offloading_{{.+}}(i{{[0-9]+}} noundef [[A_IN:%.+]], [10 x i{{[0-9]+}}]*{{.+}} [[B_IN:%.+]]) -// TCHECK: [[A_ADDR:%.+]] = alloca i{{[0-9]+}}, -// TCHECK: [[B_ADDR:%.+]] = alloca [10 x i{{[0-9]+}}]*, -// TCHECK-NOT: alloca i{{[0-9]+}}, -// TCHECK: [[B_PRIV:%.+]] = alloca [10 x i{{[0-9]+}}], -// TCHECK: store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[A_ADDR]], -// TCHECK: store [10 x i{{[0-9]+}}]* [[B_IN]], [10 x i{{[0-9]+}}]** [[B_ADDR]], -// TCHECK-64: [[A_ADDR_CONV:%.+]] = bitcast i{{[0-9]+}}* [[A_ADDR]] to i{{[0-9]+}}* -// TCHECK: [[B_ADDR_REF:%.+]] = load [10 x i{{[0-9]+}}]*, [10 x i{{[0-9]+}}]** [[B_ADDR]], + + // firstprivate(a) -// TCHECK-NOT: store i{{[0-9]+}} % // firstprivate(b) -// TCHECK: [[B_PRIV_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_PRIV]] to i8* -// TCHECK: [[B_IN_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_ADDR_REF]] to i8* -// TCHECK: call void 
@llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[B_PRIV_BCAST]], i8* align {{[0-9]+}} [[B_IN_BCAST]],{{.+}}) -// TCHECK: ret void #endif +// CHECK-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// CHECK-SAME: () #[[ATTR5:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @__tgt_register_requires(i64 1) +// CHECK-NEXT: ret void +// CHECK-64-LABEL: define {{[^@]+}}@_Z3fooiPd +// CHECK-64-SAME: (i32 noundef signext [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK-64-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 8 +// CHECK-64-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK-64-NEXT: [[AA:%.*]] = alloca i16, align 2 +// CHECK-64-NEXT: [[B:%.*]] = alloca [10 x float], align 4 +// CHECK-64-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// CHECK-64-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// CHECK-64-NEXT: [[C:%.*]] = alloca [5 x [10 x double]], align 8 +// CHECK-64-NEXT: [[__VLA_EXPR1:%.*]] = alloca i64, align 8 +// CHECK-64-NEXT: [[D:%.*]] = alloca [[STRUCT_TT:%.*]], align 8 +// CHECK-64-NEXT: [[E:%.*]] = alloca [[STRUCT_TT_0:%.*]], align 4 +// CHECK-64-NEXT: [[P:%.*]] = alloca i32*, align 64 +// CHECK-64-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8 +// CHECK-64-NEXT: [[GA_CASTED:%.*]] = alloca i64, align 8 +// CHECK-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x i8*], align 8 +// CHECK-64-NEXT: [[AA_CASTED:%.*]] = alloca i64, align 8 +// CHECK-64-NEXT: [[DOTOFFLOAD_BASEPTRS4:%.*]] = alloca [9 x i8*], align 8 +// CHECK-64-NEXT: [[DOTOFFLOAD_PTRS5:%.*]] = alloca [9 x i8*], align 8 +// CHECK-64-NEXT: [[DOTOFFLOAD_MAPPERS6:%.*]] = alloca [9 x i8*], align 8 +// CHECK-64-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [9 x i64], align 8 +// CHECK-64-NEXT: [[DOTOFFLOAD_BASEPTRS10:%.*]] = alloca [2 x i8*], align 8 +// CHECK-64-NEXT: [[DOTOFFLOAD_PTRS11:%.*]] = alloca [2 x i8*], align 8 +// CHECK-64-NEXT: [[DOTOFFLOAD_MAPPERS12:%.*]] = alloca [2 x i8*], align 8 +// CHECK-64-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK-64-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 8 +// CHECK-64-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK-64-NEXT: store i16 0, i16* [[AA]], align 2 +// CHECK-64-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK-64-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +// CHECK-64-NEXT: [[TMP2:%.*]] = call i8* @llvm.stacksave() +// CHECK-64-NEXT: store i8* [[TMP2]], i8** [[SAVED_STACK]], align 8 +// CHECK-64-NEXT: [[VLA:%.*]] = alloca float, i64 [[TMP1]], align 4 +// CHECK-64-NEXT: store i64 [[TMP1]], i64* [[__VLA_EXPR0]], align 8 +// CHECK-64-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK-64-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +// CHECK-64-NEXT: [[TMP5:%.*]] = mul nuw i64 5, [[TMP4]] +// CHECK-64-NEXT: [[VLA1:%.*]] = alloca double, i64 [[TMP5]], align 8 +// CHECK-64-NEXT: store i64 [[TMP4]], i64* [[__VLA_EXPR1]], align 8 +// CHECK-64-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 0 +// CHECK-64-NEXT: [[TMP6:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK-64-NEXT: store i32 [[TMP6]], i32* [[X]], align 4 +// CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 1 +// CHECK-64-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK-64-NEXT: store i32 [[TMP7]], 
i32* [[Y]], align 4 +// CHECK-64-NEXT: store i32* [[A]], i32** [[P]], align 64 +// CHECK-64-NEXT: [[TMP8:%.*]] = load i32, i32* [[A]], align 4 +// CHECK-64-NEXT: [[CONV:%.*]] = bitcast i64* [[A_CASTED]] to i32* +// CHECK-64-NEXT: store i32 [[TMP8]], i32* [[CONV]], align 4 +// CHECK-64-NEXT: [[TMP9:%.*]] = load i64, i64* [[A_CASTED]], align 8 +// CHECK-64-NEXT: [[TMP10:%.*]] = load i32*, i32** [[P]], align 64 +// CHECK-64-NEXT: [[TMP11:%.*]] = load i32, i32* @ga, align 4 +// CHECK-64-NEXT: [[CONV2:%.*]] = bitcast i64* [[GA_CASTED]] to i32* +// CHECK-64-NEXT: store i32 [[TMP11]], i32* [[CONV2]], align 4 +// CHECK-64-NEXT: [[TMP12:%.*]] = load i64, i64* [[GA_CASTED]], align 8 +// CHECK-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK-64-NEXT: [[TMP14:%.*]] = bitcast i8** [[TMP13]] to i64* +// CHECK-64-NEXT: store i64 [[TMP9]], i64* [[TMP14]], align 8 +// CHECK-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK-64-NEXT: [[TMP16:%.*]] = bitcast i8** [[TMP15]] to i64* +// CHECK-64-NEXT: store i64 [[TMP9]], i64* [[TMP16]], align 8 +// CHECK-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK-64-NEXT: store i8* null, i8** [[TMP17]], align 8 +// CHECK-64-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK-64-NEXT: [[TMP19:%.*]] = bitcast i8** [[TMP18]] to i32** +// CHECK-64-NEXT: store i32* [[TMP10]], i32** [[TMP19]], align 8 +// CHECK-64-NEXT: [[TMP20:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK-64-NEXT: [[TMP21:%.*]] = bitcast i8** [[TMP20]] to i32** +// CHECK-64-NEXT: store i32* [[TMP10]], i32** [[TMP21]], align 8 +// CHECK-64-NEXT: [[TMP22:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CHECK-64-NEXT: store i8* null, i8** [[TMP22]], align 8 +// CHECK-64-NEXT: [[TMP23:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK-64-NEXT: [[TMP24:%.*]] = bitcast i8** [[TMP23]] to i64* +// CHECK-64-NEXT: store i64 [[TMP12]], i64* [[TMP24]], align 8 +// CHECK-64-NEXT: [[TMP25:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK-64-NEXT: [[TMP26:%.*]] = bitcast i8** [[TMP25]] to i64* +// CHECK-64-NEXT: store i64 [[TMP12]], i64* [[TMP26]], align 8 +// CHECK-64-NEXT: [[TMP27:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2 +// CHECK-64-NEXT: store i8* null, i8** [[TMP27]], align 8 +// CHECK-64-NEXT: [[TMP28:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK-64-NEXT: [[TMP29:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK-64-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK-64-NEXT: store i32 2, i32* [[TMP30]], align 4 +// CHECK-64-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK-64-NEXT: store i32 3, i32* [[TMP31]], align 4 +// CHECK-64-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], 
%struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK-64-NEXT: store i8** [[TMP28]], i8*** [[TMP32]], align 8 +// CHECK-64-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK-64-NEXT: store i8** [[TMP29]], i8*** [[TMP33]], align 8 +// CHECK-64-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK-64-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_sizes, i32 0, i32 0), i64** [[TMP34]], align 8 +// CHECK-64-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK-64-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_maptypes, i32 0, i32 0), i64** [[TMP35]], align 8 +// CHECK-64-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK-64-NEXT: store i8** null, i8*** [[TMP36]], align 8 +// CHECK-64-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK-64-NEXT: store i8** null, i8*** [[TMP37]], align 8 +// CHECK-64-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK-64-NEXT: store i64 0, i64* [[TMP38]], align 8 +// CHECK-64-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK-64-NEXT: store i64 0, i64* [[TMP39]], align 8 +// CHECK-64-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP40]], align 4 +// CHECK-64-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK-64-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP41]], align 4 +// CHECK-64-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK-64-NEXT: store i32 0, i32* [[TMP42]], align 4 +// CHECK-64-NEXT: [[TMP43:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK-64-NEXT: [[TMP44:%.*]] = icmp ne i32 [[TMP43]], 0 +// CHECK-64-NEXT: br i1 [[TMP44]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK-64: omp_offload.failed: +// CHECK-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63(i64 [[TMP9]], i32* [[TMP10]], i64 [[TMP12]]) #[[ATTR3:[0-9]+]] +// CHECK-64-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK-64: omp_offload.cont: +// CHECK-64-NEXT: [[TMP45:%.*]] = load i16, i16* [[AA]], align 2 +// CHECK-64-NEXT: [[CONV3:%.*]] = bitcast i64* [[AA_CASTED]] to i16* +// CHECK-64-NEXT: store i16 [[TMP45]], i16* [[CONV3]], align 2 +// CHECK-64-NEXT: [[TMP46:%.*]] = load i64, i64* [[AA_CASTED]], align 8 +// CHECK-64-NEXT: [[TMP47:%.*]] = mul nuw i64 [[TMP1]], 4 +// CHECK-64-NEXT: [[TMP48:%.*]] = mul nuw i64 5, [[TMP4]] +// 
CHECK-64-NEXT: [[TMP49:%.*]] = mul nuw i64 [[TMP48]], 8 +// CHECK-64-NEXT: [[TMP50:%.*]] = bitcast [9 x i64]* [[DOTOFFLOAD_SIZES]] to i8* +// CHECK-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP50]], i8* align 8 bitcast ([9 x i64]* @.offload_sizes.1 to i8*), i64 72, i1 false) +// CHECK-64-NEXT: [[TMP51:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 0 +// CHECK-64-NEXT: [[TMP52:%.*]] = bitcast i8** [[TMP51]] to i64* +// CHECK-64-NEXT: store i64 [[TMP46]], i64* [[TMP52]], align 8 +// CHECK-64-NEXT: [[TMP53:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 0 +// CHECK-64-NEXT: [[TMP54:%.*]] = bitcast i8** [[TMP53]] to i64* +// CHECK-64-NEXT: store i64 [[TMP46]], i64* [[TMP54]], align 8 +// CHECK-64-NEXT: [[TMP55:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 0 +// CHECK-64-NEXT: store i8* null, i8** [[TMP55]], align 8 +// CHECK-64-NEXT: [[TMP56:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 1 +// CHECK-64-NEXT: [[TMP57:%.*]] = bitcast i8** [[TMP56]] to [10 x float]** +// CHECK-64-NEXT: store [10 x float]* [[B]], [10 x float]** [[TMP57]], align 8 +// CHECK-64-NEXT: [[TMP58:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 1 +// CHECK-64-NEXT: [[TMP59:%.*]] = bitcast i8** [[TMP58]] to [10 x float]** +// CHECK-64-NEXT: store [10 x float]* [[B]], [10 x float]** [[TMP59]], align 8 +// CHECK-64-NEXT: [[TMP60:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 1 +// CHECK-64-NEXT: store i8* null, i8** [[TMP60]], align 8 +// CHECK-64-NEXT: [[TMP61:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 2 +// CHECK-64-NEXT: [[TMP62:%.*]] = bitcast i8** [[TMP61]] to i64* +// CHECK-64-NEXT: store i64 [[TMP1]], i64* [[TMP62]], align 8 +// CHECK-64-NEXT: [[TMP63:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 2 +// CHECK-64-NEXT: [[TMP64:%.*]] = bitcast i8** [[TMP63]] to i64* +// CHECK-64-NEXT: store i64 [[TMP1]], i64* [[TMP64]], align 8 +// CHECK-64-NEXT: [[TMP65:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 2 +// CHECK-64-NEXT: store i8* null, i8** [[TMP65]], align 8 +// CHECK-64-NEXT: [[TMP66:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 3 +// CHECK-64-NEXT: [[TMP67:%.*]] = bitcast i8** [[TMP66]] to float** +// CHECK-64-NEXT: store float* [[VLA]], float** [[TMP67]], align 8 +// CHECK-64-NEXT: [[TMP68:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 3 +// CHECK-64-NEXT: [[TMP69:%.*]] = bitcast i8** [[TMP68]] to float** +// CHECK-64-NEXT: store float* [[VLA]], float** [[TMP69]], align 8 +// CHECK-64-NEXT: [[TMP70:%.*]] = getelementptr inbounds [9 x i64], [9 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 3 +// CHECK-64-NEXT: store i64 [[TMP47]], i64* [[TMP70]], align 8 +// CHECK-64-NEXT: [[TMP71:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 3 +// CHECK-64-NEXT: store i8* null, i8** [[TMP71]], align 8 +// CHECK-64-NEXT: [[TMP72:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 4 +// CHECK-64-NEXT: [[TMP73:%.*]] = bitcast i8** [[TMP72]] to [5 x [10 x double]]** +// CHECK-64-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[TMP73]], align 8 +// CHECK-64-NEXT: [[TMP74:%.*]] = 
getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 4 +// CHECK-64-NEXT: [[TMP75:%.*]] = bitcast i8** [[TMP74]] to [5 x [10 x double]]** +// CHECK-64-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[TMP75]], align 8 +// CHECK-64-NEXT: [[TMP76:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 4 +// CHECK-64-NEXT: store i8* null, i8** [[TMP76]], align 8 +// CHECK-64-NEXT: [[TMP77:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 5 +// CHECK-64-NEXT: [[TMP78:%.*]] = bitcast i8** [[TMP77]] to i64* +// CHECK-64-NEXT: store i64 5, i64* [[TMP78]], align 8 +// CHECK-64-NEXT: [[TMP79:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 5 +// CHECK-64-NEXT: [[TMP80:%.*]] = bitcast i8** [[TMP79]] to i64* +// CHECK-64-NEXT: store i64 5, i64* [[TMP80]], align 8 +// CHECK-64-NEXT: [[TMP81:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 5 +// CHECK-64-NEXT: store i8* null, i8** [[TMP81]], align 8 +// CHECK-64-NEXT: [[TMP82:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 6 +// CHECK-64-NEXT: [[TMP83:%.*]] = bitcast i8** [[TMP82]] to i64* +// CHECK-64-NEXT: store i64 [[TMP4]], i64* [[TMP83]], align 8 +// CHECK-64-NEXT: [[TMP84:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 6 +// CHECK-64-NEXT: [[TMP85:%.*]] = bitcast i8** [[TMP84]] to i64* +// CHECK-64-NEXT: store i64 [[TMP4]], i64* [[TMP85]], align 8 +// CHECK-64-NEXT: [[TMP86:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 6 +// CHECK-64-NEXT: store i8* null, i8** [[TMP86]], align 8 +// CHECK-64-NEXT: [[TMP87:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 7 +// CHECK-64-NEXT: [[TMP88:%.*]] = bitcast i8** [[TMP87]] to double** +// CHECK-64-NEXT: store double* [[VLA1]], double** [[TMP88]], align 8 +// CHECK-64-NEXT: [[TMP89:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 7 +// CHECK-64-NEXT: [[TMP90:%.*]] = bitcast i8** [[TMP89]] to double** +// CHECK-64-NEXT: store double* [[VLA1]], double** [[TMP90]], align 8 +// CHECK-64-NEXT: [[TMP91:%.*]] = getelementptr inbounds [9 x i64], [9 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 7 +// CHECK-64-NEXT: store i64 [[TMP49]], i64* [[TMP91]], align 8 +// CHECK-64-NEXT: [[TMP92:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 7 +// CHECK-64-NEXT: store i8* null, i8** [[TMP92]], align 8 +// CHECK-64-NEXT: [[TMP93:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 8 +// CHECK-64-NEXT: [[TMP94:%.*]] = bitcast i8** [[TMP93]] to %struct.TT** +// CHECK-64-NEXT: store %struct.TT* [[D]], %struct.TT** [[TMP94]], align 8 +// CHECK-64-NEXT: [[TMP95:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 8 +// CHECK-64-NEXT: [[TMP96:%.*]] = bitcast i8** [[TMP95]] to %struct.TT** +// CHECK-64-NEXT: store %struct.TT* [[D]], %struct.TT** [[TMP96]], align 8 +// CHECK-64-NEXT: [[TMP97:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 8 +// CHECK-64-NEXT: store i8* null, i8** [[TMP97]], align 8 +// CHECK-64-NEXT: [[TMP98:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 0 +// CHECK-64-NEXT: [[TMP99:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* 
[[DOTOFFLOAD_PTRS5]], i32 0, i32 0 +// CHECK-64-NEXT: [[TMP100:%.*]] = getelementptr inbounds [9 x i64], [9 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +// CHECK-64-NEXT: [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK-64-NEXT: [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 0 +// CHECK-64-NEXT: store i32 2, i32* [[TMP101]], align 4 +// CHECK-64-NEXT: [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 1 +// CHECK-64-NEXT: store i32 9, i32* [[TMP102]], align 4 +// CHECK-64-NEXT: [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 2 +// CHECK-64-NEXT: store i8** [[TMP98]], i8*** [[TMP103]], align 8 +// CHECK-64-NEXT: [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 3 +// CHECK-64-NEXT: store i8** [[TMP99]], i8*** [[TMP104]], align 8 +// CHECK-64-NEXT: [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 4 +// CHECK-64-NEXT: store i64* [[TMP100]], i64** [[TMP105]], align 8 +// CHECK-64-NEXT: [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 5 +// CHECK-64-NEXT: store i64* getelementptr inbounds ([9 x i64], [9 x i64]* @.offload_maptypes.2, i32 0, i32 0), i64** [[TMP106]], align 8 +// CHECK-64-NEXT: [[TMP107:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 6 +// CHECK-64-NEXT: store i8** null, i8*** [[TMP107]], align 8 +// CHECK-64-NEXT: [[TMP108:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 7 +// CHECK-64-NEXT: store i8** null, i8*** [[TMP108]], align 8 +// CHECK-64-NEXT: [[TMP109:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 8 +// CHECK-64-NEXT: store i64 0, i64* [[TMP109]], align 8 +// CHECK-64-NEXT: [[TMP110:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 9 +// CHECK-64-NEXT: store i64 0, i64* [[TMP110]], align 8 +// CHECK-64-NEXT: [[TMP111:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 10 +// CHECK-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP111]], align 4 +// CHECK-64-NEXT: [[TMP112:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 11 +// CHECK-64-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP112]], align 4 +// CHECK-64-NEXT: [[TMP113:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 12 +// CHECK-64-NEXT: store i32 0, i32* [[TMP113]], align 4 +// CHECK-64-NEXT: [[TMP114:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]]) +// CHECK-64-NEXT: [[TMP115:%.*]] = icmp ne i32 [[TMP114]], 0 +// CHECK-64-NEXT: br i1 [[TMP115]], label 
[[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] +// CHECK-64: omp_offload.failed8: +// CHECK-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70(i64 [[TMP46]], [10 x float]* [[B]], i64 [[TMP1]], float* [[VLA]], [5 x [10 x double]]* [[C]], i64 5, i64 [[TMP4]], double* [[VLA1]], %struct.TT* [[D]]) #[[ATTR3]] +// CHECK-64-NEXT: br label [[OMP_OFFLOAD_CONT9]] +// CHECK-64: omp_offload.cont9: +// CHECK-64-NEXT: [[TMP116:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// CHECK-64-NEXT: [[TMP117:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0 +// CHECK-64-NEXT: [[TMP118:%.*]] = bitcast i8** [[TMP117]] to double** +// CHECK-64-NEXT: store double* [[TMP116]], double** [[TMP118]], align 8 +// CHECK-64-NEXT: [[TMP119:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS11]], i32 0, i32 0 +// CHECK-64-NEXT: [[TMP120:%.*]] = bitcast i8** [[TMP119]] to double** +// CHECK-64-NEXT: store double* [[TMP116]], double** [[TMP120]], align 8 +// CHECK-64-NEXT: [[TMP121:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS12]], i64 0, i64 0 +// CHECK-64-NEXT: store i8* null, i8** [[TMP121]], align 8 +// CHECK-64-NEXT: [[TMP122:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 1 +// CHECK-64-NEXT: [[TMP123:%.*]] = bitcast i8** [[TMP122]] to %struct.TT.0** +// CHECK-64-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[TMP123]], align 8 +// CHECK-64-NEXT: [[TMP124:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS11]], i32 0, i32 1 +// CHECK-64-NEXT: [[TMP125:%.*]] = bitcast i8** [[TMP124]] to %struct.TT.0** +// CHECK-64-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[TMP125]], align 8 +// CHECK-64-NEXT: [[TMP126:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS12]], i64 0, i64 1 +// CHECK-64-NEXT: store i8* null, i8** [[TMP126]], align 8 +// CHECK-64-NEXT: [[TMP127:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0 +// CHECK-64-NEXT: [[TMP128:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS11]], i32 0, i32 0 +// CHECK-64-NEXT: [[KERNEL_ARGS13:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK-64-NEXT: [[TMP129:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 0 +// CHECK-64-NEXT: store i32 2, i32* [[TMP129]], align 4 +// CHECK-64-NEXT: [[TMP130:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 1 +// CHECK-64-NEXT: store i32 2, i32* [[TMP130]], align 4 +// CHECK-64-NEXT: [[TMP131:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 2 +// CHECK-64-NEXT: store i8** [[TMP127]], i8*** [[TMP131]], align 8 +// CHECK-64-NEXT: [[TMP132:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 3 +// CHECK-64-NEXT: store i8** [[TMP128]], i8*** [[TMP132]], align 8 +// CHECK-64-NEXT: [[TMP133:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 4 +// CHECK-64-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.3, i32 0, i32 0), i64** [[TMP133]], align 8 +// CHECK-64-NEXT: [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], 
%struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 5 +// CHECK-64-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i32 0, i32 0), i64** [[TMP134]], align 8 +// CHECK-64-NEXT: [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 6 +// CHECK-64-NEXT: store i8** null, i8*** [[TMP135]], align 8 +// CHECK-64-NEXT: [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 7 +// CHECK-64-NEXT: store i8** null, i8*** [[TMP136]], align 8 +// CHECK-64-NEXT: [[TMP137:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 8 +// CHECK-64-NEXT: store i64 0, i64* [[TMP137]], align 8 +// CHECK-64-NEXT: [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 9 +// CHECK-64-NEXT: store i64 0, i64* [[TMP138]], align 8 +// CHECK-64-NEXT: [[TMP139:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 10 +// CHECK-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP139]], align 4 +// CHECK-64-NEXT: [[TMP140:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 11 +// CHECK-64-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP140]], align 4 +// CHECK-64-NEXT: [[TMP141:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 12 +// CHECK-64-NEXT: store i32 0, i32* [[TMP141]], align 4 +// CHECK-64-NEXT: [[TMP142:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]]) +// CHECK-64-NEXT: [[TMP143:%.*]] = icmp ne i32 [[TMP142]], 0 +// CHECK-64-NEXT: br i1 [[TMP143]], label [[OMP_OFFLOAD_FAILED14:%.*]], label [[OMP_OFFLOAD_CONT15:%.*]] +// CHECK-64: omp_offload.failed14: +// CHECK-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111(double* [[TMP116]], %struct.TT.0* [[E]]) #[[ATTR3]] +// CHECK-64-NEXT: br label [[OMP_OFFLOAD_CONT15]] +// CHECK-64: omp_offload.cont15: +// CHECK-64-NEXT: [[TMP144:%.*]] = load i32, i32* [[A]], align 4 +// CHECK-64-NEXT: [[TMP145:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// CHECK-64-NEXT: call void @llvm.stackrestore(i8* [[TMP145]]) +// CHECK-64-NEXT: ret i32 [[TMP144]] +// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63 +// CHECK-64-SAME: (i64 noundef [[A:%.*]], i32* noundef [[P:%.*]], i64 noundef [[GA:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK-64-NEXT: [[P_ADDR:%.*]] = alloca i32*, align 8 +// CHECK-64-NEXT: [[GA_ADDR:%.*]] = alloca i64, align 8 +// CHECK-64-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK-64-NEXT: store i32* [[P]], i32** [[P_ADDR]], align 8 +// CHECK-64-NEXT: store i64 [[GA]], i64* [[GA_ADDR]], align 8 +// CHECK-64-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// CHECK-64-NEXT: [[CONV1:%.*]] = bitcast i64* [[GA_ADDR]] to i32* +// CHECK-64-NEXT: ret void +// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70 +// CHECK-64-SAME: 
(i64 noundef [[AA:%.*]], [10 x float]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i64 noundef [[VLA:%.*]], float* noundef nonnull align 4 dereferenceable(4) [[BN:%.*]], [5 x [10 x double]]* noundef nonnull align 8 dereferenceable(400) [[C:%.*]], i64 noundef [[VLA1:%.*]], i64 noundef [[VLA3:%.*]], double* noundef nonnull align 8 dereferenceable(8) [[CN:%.*]], %struct.TT* noundef nonnull align 8 dereferenceable(16) [[D:%.*]]) #[[ATTR2]] { +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[AA_ADDR:%.*]] = alloca i64, align 8 +// CHECK-64-NEXT: [[B_ADDR:%.*]] = alloca [10 x float]*, align 8 +// CHECK-64-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8 +// CHECK-64-NEXT: [[BN_ADDR:%.*]] = alloca float*, align 8 +// CHECK-64-NEXT: [[C_ADDR:%.*]] = alloca [5 x [10 x double]]*, align 8 +// CHECK-64-NEXT: [[VLA_ADDR2:%.*]] = alloca i64, align 8 +// CHECK-64-NEXT: [[VLA_ADDR4:%.*]] = alloca i64, align 8 +// CHECK-64-NEXT: [[CN_ADDR:%.*]] = alloca double*, align 8 +// CHECK-64-NEXT: [[D_ADDR:%.*]] = alloca %struct.TT*, align 8 +// CHECK-64-NEXT: [[B5:%.*]] = alloca [10 x float], align 4 +// CHECK-64-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// CHECK-64-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// CHECK-64-NEXT: [[C7:%.*]] = alloca [5 x [10 x double]], align 8 +// CHECK-64-NEXT: [[__VLA_EXPR1:%.*]] = alloca i64, align 8 +// CHECK-64-NEXT: [[__VLA_EXPR2:%.*]] = alloca i64, align 8 +// CHECK-64-NEXT: [[D9:%.*]] = alloca [[STRUCT_TT:%.*]], align 8 +// CHECK-64-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8 +// CHECK-64-NEXT: store [10 x float]* [[B]], [10 x float]** [[B_ADDR]], align 8 +// CHECK-64-NEXT: store i64 [[VLA]], i64* [[VLA_ADDR]], align 8 +// CHECK-64-NEXT: store float* [[BN]], float** [[BN_ADDR]], align 8 +// CHECK-64-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[C_ADDR]], align 8 +// CHECK-64-NEXT: store i64 [[VLA1]], i64* [[VLA_ADDR2]], align 8 +// CHECK-64-NEXT: store i64 [[VLA3]], i64* [[VLA_ADDR4]], align 8 +// CHECK-64-NEXT: store double* [[CN]], double** [[CN_ADDR]], align 8 +// CHECK-64-NEXT: store %struct.TT* [[D]], %struct.TT** [[D_ADDR]], align 8 +// CHECK-64-NEXT: [[CONV:%.*]] = bitcast i64* [[AA_ADDR]] to i16* +// CHECK-64-NEXT: [[TMP0:%.*]] = load [10 x float]*, [10 x float]** [[B_ADDR]], align 8 +// CHECK-64-NEXT: [[TMP1:%.*]] = load i64, i64* [[VLA_ADDR]], align 8 +// CHECK-64-NEXT: [[TMP2:%.*]] = load float*, float** [[BN_ADDR]], align 8 +// CHECK-64-NEXT: [[TMP3:%.*]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]], align 8 +// CHECK-64-NEXT: [[TMP4:%.*]] = load i64, i64* [[VLA_ADDR2]], align 8 +// CHECK-64-NEXT: [[TMP5:%.*]] = load i64, i64* [[VLA_ADDR4]], align 8 +// CHECK-64-NEXT: [[TMP6:%.*]] = load double*, double** [[CN_ADDR]], align 8 +// CHECK-64-NEXT: [[TMP7:%.*]] = load %struct.TT*, %struct.TT** [[D_ADDR]], align 8 +// CHECK-64-NEXT: [[TMP8:%.*]] = bitcast [10 x float]* [[B5]] to i8* +// CHECK-64-NEXT: [[TMP9:%.*]] = bitcast [10 x float]* [[TMP0]] to i8* +// CHECK-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i64 40, i1 false) +// CHECK-64-NEXT: [[TMP10:%.*]] = call i8* @llvm.stacksave() +// CHECK-64-NEXT: store i8* [[TMP10]], i8** [[SAVED_STACK]], align 8 +// CHECK-64-NEXT: [[VLA6:%.*]] = alloca float, i64 [[TMP1]], align 4 +// CHECK-64-NEXT: store i64 [[TMP1]], i64* [[__VLA_EXPR0]], align 8 +// CHECK-64-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP1]], 4 +// CHECK-64-NEXT: [[TMP12:%.*]] = bitcast float* [[VLA6]] to i8* +// CHECK-64-NEXT: [[TMP13:%.*]] = bitcast float* 
[[TMP2]] to i8* +// CHECK-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP12]], i8* align 4 [[TMP13]], i64 [[TMP11]], i1 false) +// CHECK-64-NEXT: [[TMP14:%.*]] = bitcast [5 x [10 x double]]* [[C7]] to i8* +// CHECK-64-NEXT: [[TMP15:%.*]] = bitcast [5 x [10 x double]]* [[TMP3]] to i8* +// CHECK-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 400, i1 false) +// CHECK-64-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP4]], [[TMP5]] +// CHECK-64-NEXT: [[VLA8:%.*]] = alloca double, i64 [[TMP16]], align 8 +// CHECK-64-NEXT: store i64 [[TMP4]], i64* [[__VLA_EXPR1]], align 8 +// CHECK-64-NEXT: store i64 [[TMP5]], i64* [[__VLA_EXPR2]], align 8 +// CHECK-64-NEXT: [[TMP17:%.*]] = mul nuw i64 [[TMP4]], [[TMP5]] +// CHECK-64-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 8 +// CHECK-64-NEXT: [[TMP19:%.*]] = bitcast double* [[VLA8]] to i8* +// CHECK-64-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP6]] to i8* +// CHECK-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP19]], i8* align 8 [[TMP20]], i64 [[TMP18]], i1 false) +// CHECK-64-NEXT: [[TMP21:%.*]] = bitcast %struct.TT* [[D9]] to i8* +// CHECK-64-NEXT: [[TMP22:%.*]] = bitcast %struct.TT* [[TMP7]] to i8* +// CHECK-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP21]], i8* align 8 [[TMP22]], i64 16, i1 false) +// CHECK-64-NEXT: [[TMP23:%.*]] = load i16, i16* [[CONV]], align 2 +// CHECK-64-NEXT: [[CONV10:%.*]] = sext i16 [[TMP23]] to i32 +// CHECK-64-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV10]], 1 +// CHECK-64-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD]] to i16 +// CHECK-64-NEXT: store i16 [[CONV11]], i16* [[CONV]], align 2 +// CHECK-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[B5]], i64 0, i64 2 +// CHECK-64-NEXT: store float 1.000000e+00, float* [[ARRAYIDX]], align 4 +// CHECK-64-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[VLA6]], i64 3 +// CHECK-64-NEXT: store float 1.000000e+00, float* [[ARRAYIDX12]], align 4 +// CHECK-64-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[C7]], i64 0, i64 1 +// CHECK-64-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX13]], i64 0, i64 2 +// CHECK-64-NEXT: store double 1.000000e+00, double* [[ARRAYIDX14]], align 8 +// CHECK-64-NEXT: [[TMP24:%.*]] = mul nsw i64 1, [[TMP5]] +// CHECK-64-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds double, double* [[VLA8]], i64 [[TMP24]] +// CHECK-64-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX15]], i64 3 +// CHECK-64-NEXT: store double 1.000000e+00, double* [[ARRAYIDX16]], align 8 +// CHECK-64-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 0 +// CHECK-64-NEXT: store i64 1, i64* [[X]], align 8 +// CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 1 +// CHECK-64-NEXT: store i8 1, i8* [[Y]], align 8 +// CHECK-64-NEXT: [[TMP25:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// CHECK-64-NEXT: call void @llvm.stackrestore(i8* [[TMP25]]) +// CHECK-64-NEXT: ret void +// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111 +// CHECK-64-SAME: (double* noundef [[PTR:%.*]], %struct.TT.0* noundef nonnull align 4 dereferenceable(8) [[E:%.*]]) #[[ATTR2]] { +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 8 +// CHECK-64-NEXT: [[E_ADDR:%.*]] = alloca %struct.TT.0*, align 8 +// CHECK-64-NEXT: 
[[E1:%.*]] = alloca [[STRUCT_TT_0:%.*]], align 4 +// CHECK-64-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 8 +// CHECK-64-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[E_ADDR]], align 8 +// CHECK-64-NEXT: [[TMP0:%.*]] = load %struct.TT.0*, %struct.TT.0** [[E_ADDR]], align 8 +// CHECK-64-NEXT: [[TMP1:%.*]] = bitcast %struct.TT.0* [[E1]] to i8* +// CHECK-64-NEXT: [[TMP2:%.*]] = bitcast %struct.TT.0* [[TMP0]] to i8* +// CHECK-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i64 8, i1 false) +// CHECK-64-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E1]], i32 0, i32 0 +// CHECK-64-NEXT: [[TMP3:%.*]] = load i32, i32* [[X]], align 4 +// CHECK-64-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP3]] to double +// CHECK-64-NEXT: [[TMP4:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// CHECK-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[TMP4]], i64 0 +// CHECK-64-NEXT: store double [[CONV]], double* [[ARRAYIDX]], align 8 +// CHECK-64-NEXT: [[TMP5:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// CHECK-64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[TMP5]], i64 0 +// CHECK-64-NEXT: [[TMP6:%.*]] = load double, double* [[ARRAYIDX2]], align 8 +// CHECK-64-NEXT: [[INC:%.*]] = fadd double [[TMP6]], 1.000000e+00 +// CHECK-64-NEXT: store double [[INC]], double* [[ARRAYIDX2]], align 8 +// CHECK-64-NEXT: ret void +// CHECK-64-LABEL: define {{[^@]+}}@_Z3bariPd +// CHECK-64-SAME: (i32 noundef signext [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0]] { +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK-64-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 8 +// CHECK-64-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK-64-NEXT: [[S:%.*]] = alloca [[STRUCT_S1:%.*]], align 8 +// CHECK-64-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK-64-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 8 +// CHECK-64-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK-64-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK-64-NEXT: [[TMP1:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// CHECK-64-NEXT: [[CALL:%.*]] = call noundef signext i32 @_Z3fooiPd(i32 noundef signext [[TMP0]], double* noundef [[TMP1]]) +// CHECK-64-NEXT: [[TMP2:%.*]] = load i32, i32* [[A]], align 4 +// CHECK-64-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[CALL]] +// CHECK-64-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// CHECK-64-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK-64-NEXT: [[CALL1:%.*]] = call noundef signext i32 @_ZN2S12r1Ei(%struct.S1* noundef nonnull align 8 dereferenceable(8) [[S]], i32 noundef signext [[TMP3]]) +// CHECK-64-NEXT: [[TMP4:%.*]] = load i32, i32* [[A]], align 4 +// CHECK-64-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], [[CALL1]] +// CHECK-64-NEXT: store i32 [[ADD2]], i32* [[A]], align 4 +// CHECK-64-NEXT: [[TMP5:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK-64-NEXT: [[CALL3:%.*]] = call noundef signext i32 @_ZL7fstatici(i32 noundef signext [[TMP5]]) +// CHECK-64-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4 +// CHECK-64-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP6]], [[CALL3]] +// CHECK-64-NEXT: store i32 [[ADD4]], i32* [[A]], align 4 +// CHECK-64-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK-64-NEXT: [[CALL5:%.*]] = call noundef signext i32 @_Z9ftemplateIiET_i(i32 noundef signext [[TMP7]]) +// CHECK-64-NEXT: [[TMP8:%.*]] = load i32, i32* [[A]], align 4 +// CHECK-64-NEXT: [[ADD6:%.*]] 
= add nsw i32 [[TMP8]], [[CALL5]] +// CHECK-64-NEXT: store i32 [[ADD6]], i32* [[A]], align 4 +// CHECK-64-NEXT: [[TMP9:%.*]] = load i32, i32* [[A]], align 4 +// CHECK-64-NEXT: ret i32 [[TMP9]] +// CHECK-64-LABEL: define {{[^@]+}}@_ZN2S12r1Ei +// CHECK-64-SAME: (%struct.S1* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] comdat align 2 { +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 8 +// CHECK-64-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK-64-NEXT: [[B:%.*]] = alloca i32, align 4 +// CHECK-64-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// CHECK-64-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// CHECK-64-NEXT: [[B_CASTED:%.*]] = alloca i64, align 8 +// CHECK-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [5 x i8*], align 8 +// CHECK-64-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [5 x i64], align 8 +// CHECK-64-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 8 +// CHECK-64-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK-64-NEXT: [[THIS1:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 8 +// CHECK-64-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK-64-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// CHECK-64-NEXT: store i32 [[ADD]], i32* [[B]], align 4 +// CHECK-64-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK-64-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK-64-NEXT: [[TMP3:%.*]] = call i8* @llvm.stacksave() +// CHECK-64-NEXT: store i8* [[TMP3]], i8** [[SAVED_STACK]], align 8 +// CHECK-64-NEXT: [[TMP4:%.*]] = mul nuw i64 2, [[TMP2]] +// CHECK-64-NEXT: [[VLA:%.*]] = alloca i16, i64 [[TMP4]], align 2 +// CHECK-64-NEXT: store i64 [[TMP2]], i64* [[__VLA_EXPR0]], align 8 +// CHECK-64-NEXT: [[TMP5:%.*]] = load i32, i32* [[B]], align 4 +// CHECK-64-NEXT: [[CONV:%.*]] = bitcast i64* [[B_CASTED]] to i32* +// CHECK-64-NEXT: store i32 [[TMP5]], i32* [[CONV]], align 4 +// CHECK-64-NEXT: [[TMP6:%.*]] = load i64, i64* [[B_CASTED]], align 8 +// CHECK-64-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[THIS1]], i32 0, i32 0 +// CHECK-64-NEXT: [[TMP7:%.*]] = mul nuw i64 2, [[TMP2]] +// CHECK-64-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2 +// CHECK-64-NEXT: [[TMP9:%.*]] = bitcast [5 x i64]* [[DOTOFFLOAD_SIZES]] to i8* +// CHECK-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP9]], i8* align 8 bitcast ([5 x i64]* @.offload_sizes.5 to i8*), i64 40, i1 false) +// CHECK-64-NEXT: [[TMP10:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK-64-NEXT: [[TMP11:%.*]] = bitcast i8** [[TMP10]] to %struct.S1** +// CHECK-64-NEXT: store %struct.S1* [[THIS1]], %struct.S1** [[TMP11]], align 8 +// CHECK-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK-64-NEXT: [[TMP13:%.*]] = bitcast i8** [[TMP12]] to double** +// CHECK-64-NEXT: store double* [[A]], double** [[TMP13]], align 8 +// CHECK-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK-64-NEXT: store i8* null, i8** [[TMP14]], align 8 +// CHECK-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK-64-NEXT: [[TMP16:%.*]] = bitcast i8** [[TMP15]] to i64* +// 
CHECK-64-NEXT: store i64 [[TMP6]], i64* [[TMP16]], align 8 +// CHECK-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK-64-NEXT: [[TMP18:%.*]] = bitcast i8** [[TMP17]] to i64* +// CHECK-64-NEXT: store i64 [[TMP6]], i64* [[TMP18]], align 8 +// CHECK-64-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CHECK-64-NEXT: store i8* null, i8** [[TMP19]], align 8 +// CHECK-64-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK-64-NEXT: [[TMP21:%.*]] = bitcast i8** [[TMP20]] to i64* +// CHECK-64-NEXT: store i64 2, i64* [[TMP21]], align 8 +// CHECK-64-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK-64-NEXT: [[TMP23:%.*]] = bitcast i8** [[TMP22]] to i64* +// CHECK-64-NEXT: store i64 2, i64* [[TMP23]], align 8 +// CHECK-64-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2 +// CHECK-64-NEXT: store i8* null, i8** [[TMP24]], align 8 +// CHECK-64-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 3 +// CHECK-64-NEXT: [[TMP26:%.*]] = bitcast i8** [[TMP25]] to i64* +// CHECK-64-NEXT: store i64 [[TMP2]], i64* [[TMP26]], align 8 +// CHECK-64-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 3 +// CHECK-64-NEXT: [[TMP28:%.*]] = bitcast i8** [[TMP27]] to i64* +// CHECK-64-NEXT: store i64 [[TMP2]], i64* [[TMP28]], align 8 +// CHECK-64-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 3 +// CHECK-64-NEXT: store i8* null, i8** [[TMP29]], align 8 +// CHECK-64-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 4 +// CHECK-64-NEXT: [[TMP31:%.*]] = bitcast i8** [[TMP30]] to i16** +// CHECK-64-NEXT: store i16* [[VLA]], i16** [[TMP31]], align 8 +// CHECK-64-NEXT: [[TMP32:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 4 +// CHECK-64-NEXT: [[TMP33:%.*]] = bitcast i8** [[TMP32]] to i16** +// CHECK-64-NEXT: store i16* [[VLA]], i16** [[TMP33]], align 8 +// CHECK-64-NEXT: [[TMP34:%.*]] = getelementptr inbounds [5 x i64], [5 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 4 +// CHECK-64-NEXT: store i64 [[TMP8]], i64* [[TMP34]], align 8 +// CHECK-64-NEXT: [[TMP35:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 4 +// CHECK-64-NEXT: store i8* null, i8** [[TMP35]], align 8 +// CHECK-64-NEXT: [[TMP36:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK-64-NEXT: [[TMP37:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK-64-NEXT: [[TMP38:%.*]] = getelementptr inbounds [5 x i64], [5 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +// CHECK-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK-64-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK-64-NEXT: store i32 2, i32* [[TMP39]], align 4 +// CHECK-64-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK-64-NEXT: store i32 5, i32* [[TMP40]], align 4 +// CHECK-64-NEXT: [[TMP41:%.*]] = 
getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK-64-NEXT: store i8** [[TMP36]], i8*** [[TMP41]], align 8 +// CHECK-64-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK-64-NEXT: store i8** [[TMP37]], i8*** [[TMP42]], align 8 +// CHECK-64-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK-64-NEXT: store i64* [[TMP38]], i64** [[TMP43]], align 8 +// CHECK-64-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK-64-NEXT: store i64* getelementptr inbounds ([5 x i64], [5 x i64]* @.offload_maptypes.6, i32 0, i32 0), i64** [[TMP44]], align 8 +// CHECK-64-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK-64-NEXT: store i8** null, i8*** [[TMP45]], align 8 +// CHECK-64-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK-64-NEXT: store i8** null, i8*** [[TMP46]], align 8 +// CHECK-64-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK-64-NEXT: store i64 0, i64* [[TMP47]], align 8 +// CHECK-64-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK-64-NEXT: store i64 0, i64* [[TMP48]], align 8 +// CHECK-64-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP49]], align 4 +// CHECK-64-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK-64-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP50]], align 4 +// CHECK-64-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK-64-NEXT: store i32 0, i32* [[TMP51]], align 4 +// CHECK-64-NEXT: [[TMP52:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK-64-NEXT: [[TMP53:%.*]] = icmp ne i32 [[TMP52]], 0 +// CHECK-64-NEXT: br i1 [[TMP53]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK-64: omp_offload.failed: +// CHECK-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167(%struct.S1* [[THIS1]], i64 [[TMP6]], i64 2, i64 [[TMP2]], i16* [[VLA]]) #[[ATTR3]] +// CHECK-64-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK-64: omp_offload.cont: +// CHECK-64-NEXT: [[TMP54:%.*]] = mul nsw i64 1, [[TMP2]] +// CHECK-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA]], i64 [[TMP54]] +// CHECK-64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i64 1 +// CHECK-64-NEXT: [[TMP55:%.*]] = load i16, i16* [[ARRAYIDX2]], align 2 +// CHECK-64-NEXT: [[CONV3:%.*]] = sext i16 [[TMP55]] to i32 +// 
CHECK-64-NEXT: [[TMP56:%.*]] = load i32, i32* [[B]], align 4 +// CHECK-64-NEXT: [[ADD4:%.*]] = add nsw i32 [[CONV3]], [[TMP56]] +// CHECK-64-NEXT: [[TMP57:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// CHECK-64-NEXT: call void @llvm.stackrestore(i8* [[TMP57]]) +// CHECK-64-NEXT: ret i32 [[ADD4]] +// CHECK-64-LABEL: define {{[^@]+}}@_ZL7fstatici +// CHECK-64-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0]] { +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK-64-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK-64-NEXT: [[AAA:%.*]] = alloca i8, align 1 +// CHECK-64-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// CHECK-64-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8 +// CHECK-64-NEXT: [[AAA_CASTED:%.*]] = alloca i64, align 8 +// CHECK-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x i8*], align 8 +// CHECK-64-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK-64-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK-64-NEXT: store i8 0, i8* [[AAA]], align 1 +// CHECK-64-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// CHECK-64-NEXT: [[CONV:%.*]] = bitcast i64* [[A_CASTED]] to i32* +// CHECK-64-NEXT: store i32 [[TMP0]], i32* [[CONV]], align 4 +// CHECK-64-NEXT: [[TMP1:%.*]] = load i64, i64* [[A_CASTED]], align 8 +// CHECK-64-NEXT: [[TMP2:%.*]] = load i8, i8* [[AAA]], align 1 +// CHECK-64-NEXT: [[CONV1:%.*]] = bitcast i64* [[AAA_CASTED]] to i8* +// CHECK-64-NEXT: store i8 [[TMP2]], i8* [[CONV1]], align 1 +// CHECK-64-NEXT: [[TMP3:%.*]] = load i64, i64* [[AAA_CASTED]], align 8 +// CHECK-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK-64-NEXT: [[TMP5:%.*]] = bitcast i8** [[TMP4]] to i64* +// CHECK-64-NEXT: store i64 [[TMP1]], i64* [[TMP5]], align 8 +// CHECK-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK-64-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i64* +// CHECK-64-NEXT: store i64 [[TMP1]], i64* [[TMP7]], align 8 +// CHECK-64-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK-64-NEXT: store i8* null, i8** [[TMP8]], align 8 +// CHECK-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK-64-NEXT: [[TMP10:%.*]] = bitcast i8** [[TMP9]] to i64* +// CHECK-64-NEXT: store i64 [[TMP3]], i64* [[TMP10]], align 8 +// CHECK-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK-64-NEXT: [[TMP12:%.*]] = bitcast i8** [[TMP11]] to i64* +// CHECK-64-NEXT: store i64 [[TMP3]], i64* [[TMP12]], align 8 +// CHECK-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CHECK-64-NEXT: store i8* null, i8** [[TMP13]], align 8 +// CHECK-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK-64-NEXT: [[TMP15:%.*]] = bitcast i8** [[TMP14]] to [10 x i32]** +// CHECK-64-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP15]], align 8 +// CHECK-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK-64-NEXT: [[TMP17:%.*]] = bitcast i8** [[TMP16]] to [10 x i32]** +// CHECK-64-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP17]], align 8 +// 
CHECK-64-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2 +// CHECK-64-NEXT: store i8* null, i8** [[TMP18]], align 8 +// CHECK-64-NEXT: [[TMP19:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK-64-NEXT: [[TMP20:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK-64-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK-64-NEXT: store i32 2, i32* [[TMP21]], align 4 +// CHECK-64-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK-64-NEXT: store i32 3, i32* [[TMP22]], align 4 +// CHECK-64-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK-64-NEXT: store i8** [[TMP19]], i8*** [[TMP23]], align 8 +// CHECK-64-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK-64-NEXT: store i8** [[TMP20]], i8*** [[TMP24]], align 8 +// CHECK-64-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK-64-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_sizes.7, i32 0, i32 0), i64** [[TMP25]], align 8 +// CHECK-64-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK-64-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_maptypes.8, i32 0, i32 0), i64** [[TMP26]], align 8 +// CHECK-64-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK-64-NEXT: store i8** null, i8*** [[TMP27]], align 8 +// CHECK-64-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK-64-NEXT: store i8** null, i8*** [[TMP28]], align 8 +// CHECK-64-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK-64-NEXT: store i64 0, i64* [[TMP29]], align 8 +// CHECK-64-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK-64-NEXT: store i64 0, i64* [[TMP30]], align 8 +// CHECK-64-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP31]], align 4 +// CHECK-64-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK-64-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP32]], align 4 +// CHECK-64-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK-64-NEXT: store i32 0, i32* [[TMP33]], align 4 +// CHECK-64-NEXT: [[TMP34:%.*]] = call i32 
@__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK-64-NEXT: [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0 +// CHECK-64-NEXT: br i1 [[TMP35]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK-64: omp_offload.failed: +// CHECK-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142(i64 [[TMP1]], i64 [[TMP3]], [10 x i32]* [[B]]) #[[ATTR3]] +// CHECK-64-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK-64: omp_offload.cont: +// CHECK-64-NEXT: [[TMP36:%.*]] = load i32, i32* [[A]], align 4 +// CHECK-64-NEXT: ret i32 [[TMP36]] +// CHECK-64-LABEL: define {{[^@]+}}@_Z9ftemplateIiET_i +// CHECK-64-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0]] comdat { +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK-64-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK-64-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// CHECK-64-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8 +// CHECK-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [2 x i8*], align 8 +// CHECK-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [2 x i8*], align 8 +// CHECK-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [2 x i8*], align 8 +// CHECK-64-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK-64-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK-64-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// CHECK-64-NEXT: [[CONV:%.*]] = bitcast i64* [[A_CASTED]] to i32* +// CHECK-64-NEXT: store i32 [[TMP0]], i32* [[CONV]], align 4 +// CHECK-64-NEXT: [[TMP1:%.*]] = load i64, i64* [[A_CASTED]], align 8 +// CHECK-64-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK-64-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i64* +// CHECK-64-NEXT: store i64 [[TMP1]], i64* [[TMP3]], align 8 +// CHECK-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK-64-NEXT: [[TMP5:%.*]] = bitcast i8** [[TMP4]] to i64* +// CHECK-64-NEXT: store i64 [[TMP1]], i64* [[TMP5]], align 8 +// CHECK-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK-64-NEXT: store i8* null, i8** [[TMP6]], align 8 +// CHECK-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK-64-NEXT: [[TMP8:%.*]] = bitcast i8** [[TMP7]] to [10 x i32]** +// CHECK-64-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP8]], align 8 +// CHECK-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK-64-NEXT: [[TMP10:%.*]] = bitcast i8** [[TMP9]] to [10 x i32]** +// CHECK-64-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP10]], align 8 +// CHECK-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CHECK-64-NEXT: store i8* null, i8** [[TMP11]], align 8 +// CHECK-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// 
CHECK-64-NEXT: store i32 2, i32* [[TMP14]], align 4 +// CHECK-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK-64-NEXT: store i32 2, i32* [[TMP15]], align 4 +// CHECK-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK-64-NEXT: store i8** [[TMP12]], i8*** [[TMP16]], align 8 +// CHECK-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK-64-NEXT: store i8** [[TMP13]], i8*** [[TMP17]], align 8 +// CHECK-64-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK-64-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.9, i32 0, i32 0), i64** [[TMP18]], align 8 +// CHECK-64-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK-64-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.10, i32 0, i32 0), i64** [[TMP19]], align 8 +// CHECK-64-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK-64-NEXT: store i8** null, i8*** [[TMP20]], align 8 +// CHECK-64-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK-64-NEXT: store i8** null, i8*** [[TMP21]], align 8 +// CHECK-64-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK-64-NEXT: store i64 0, i64* [[TMP22]], align 8 +// CHECK-64-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK-64-NEXT: store i64 0, i64* [[TMP23]], align 8 +// CHECK-64-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP24]], align 4 +// CHECK-64-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK-64-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP25]], align 4 +// CHECK-64-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK-64-NEXT: store i32 0, i32* [[TMP26]], align 4 +// CHECK-64-NEXT: [[TMP27:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK-64-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0 +// CHECK-64-NEXT: br i1 [[TMP28]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK-64: omp_offload.failed: +// CHECK-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128(i64 [[TMP1]], [10 x i32]* [[B]]) #[[ATTR3]] +// CHECK-64-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK-64: omp_offload.cont: +// CHECK-64-NEXT: [[TMP29:%.*]] = load i32, i32* 
[[A]], align 4 +// CHECK-64-NEXT: ret i32 [[TMP29]] +// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167 +// CHECK-64-SAME: (%struct.S1* noundef [[THIS:%.*]], i64 noundef [[B:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR2]] { +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 8 +// CHECK-64-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 +// CHECK-64-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8 +// CHECK-64-NEXT: [[VLA_ADDR2:%.*]] = alloca i64, align 8 +// CHECK-64-NEXT: [[C_ADDR:%.*]] = alloca i16*, align 8 +// CHECK-64-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// CHECK-64-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// CHECK-64-NEXT: [[__VLA_EXPR1:%.*]] = alloca i64, align 8 +// CHECK-64-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 8 +// CHECK-64-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8 +// CHECK-64-NEXT: store i64 [[VLA]], i64* [[VLA_ADDR]], align 8 +// CHECK-64-NEXT: store i64 [[VLA1]], i64* [[VLA_ADDR2]], align 8 +// CHECK-64-NEXT: store i16* [[C]], i16** [[C_ADDR]], align 8 +// CHECK-64-NEXT: [[TMP0:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 8 +// CHECK-64-NEXT: [[CONV:%.*]] = bitcast i64* [[B_ADDR]] to i32* +// CHECK-64-NEXT: [[TMP1:%.*]] = load i64, i64* [[VLA_ADDR]], align 8 +// CHECK-64-NEXT: [[TMP2:%.*]] = load i64, i64* [[VLA_ADDR2]], align 8 +// CHECK-64-NEXT: [[TMP3:%.*]] = load i16*, i16** [[C_ADDR]], align 8 +// CHECK-64-NEXT: [[TMP4:%.*]] = call i8* @llvm.stacksave() +// CHECK-64-NEXT: store i8* [[TMP4]], i8** [[SAVED_STACK]], align 8 +// CHECK-64-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP1]], [[TMP2]] +// CHECK-64-NEXT: [[VLA3:%.*]] = alloca i16, i64 [[TMP5]], align 2 +// CHECK-64-NEXT: store i64 [[TMP1]], i64* [[__VLA_EXPR0]], align 8 +// CHECK-64-NEXT: store i64 [[TMP2]], i64* [[__VLA_EXPR1]], align 8 +// CHECK-64-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP1]], [[TMP2]] +// CHECK-64-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 2 +// CHECK-64-NEXT: [[TMP8:%.*]] = bitcast i16* [[VLA3]] to i8* +// CHECK-64-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP3]] to i8* +// CHECK-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 [[TMP8]], i8* align 2 [[TMP9]], i64 [[TMP7]], i1 false) +// CHECK-64-NEXT: [[TMP10:%.*]] = load i32, i32* [[CONV]], align 4 +// CHECK-64-NEXT: [[CONV4:%.*]] = sitofp i32 [[TMP10]] to double +// CHECK-64-NEXT: [[ADD:%.*]] = fadd double [[CONV4]], 1.500000e+00 +// CHECK-64-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[TMP0]], i32 0, i32 0 +// CHECK-64-NEXT: store double [[ADD]], double* [[A]], align 8 +// CHECK-64-NEXT: [[A5:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0 +// CHECK-64-NEXT: [[TMP11:%.*]] = load double, double* [[A5]], align 8 +// CHECK-64-NEXT: [[INC:%.*]] = fadd double [[TMP11]], 1.000000e+00 +// CHECK-64-NEXT: store double [[INC]], double* [[A5]], align 8 +// CHECK-64-NEXT: [[CONV6:%.*]] = fptosi double [[INC]] to i16 +// CHECK-64-NEXT: [[TMP12:%.*]] = mul nsw i64 1, [[TMP2]] +// CHECK-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA3]], i64 [[TMP12]] +// CHECK-64-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i64 1 +// CHECK-64-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX7]], align 2 +// CHECK-64-NEXT: [[TMP13:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// CHECK-64-NEXT: call void @llvm.stackrestore(i8* [[TMP13]]) +// CHECK-64-NEXT: 
ret void +// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142 +// CHECK-64-SAME: (i64 noundef [[A:%.*]], i64 noundef [[AAA:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR2]] { +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK-64-NEXT: [[AAA_ADDR:%.*]] = alloca i64, align 8 +// CHECK-64-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK-64-NEXT: [[B2:%.*]] = alloca [10 x i32], align 4 +// CHECK-64-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK-64-NEXT: store i64 [[AAA]], i64* [[AAA_ADDR]], align 8 +// CHECK-64-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK-64-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// CHECK-64-NEXT: [[CONV1:%.*]] = bitcast i64* [[AAA_ADDR]] to i8* +// CHECK-64-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK-64-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B2]] to i8* +// CHECK-64-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i64 40, i1 false) +// CHECK-64-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 4 +// CHECK-64-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK-64-NEXT: store i32 [[ADD]], i32* [[CONV]], align 4 +// CHECK-64-NEXT: [[TMP4:%.*]] = load i8, i8* [[CONV1]], align 1 +// CHECK-64-NEXT: [[CONV3:%.*]] = sext i8 [[TMP4]] to i32 +// CHECK-64-NEXT: [[ADD4:%.*]] = add nsw i32 [[CONV3]], 1 +// CHECK-64-NEXT: [[CONV5:%.*]] = trunc i32 [[ADD4]] to i8 +// CHECK-64-NEXT: store i8 [[CONV5]], i8* [[CONV1]], align 1 +// CHECK-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B2]], i64 0, i64 2 +// CHECK-64-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK-64-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK-64-NEXT: store i32 [[ADD6]], i32* [[ARRAYIDX]], align 4 +// CHECK-64-NEXT: ret void +// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128 +// CHECK-64-SAME: (i64 noundef [[A:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR2]] { +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK-64-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK-64-NEXT: [[B1:%.*]] = alloca [10 x i32], align 4 +// CHECK-64-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK-64-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK-64-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// CHECK-64-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK-64-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B1]] to i8* +// CHECK-64-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i64 40, i1 false) +// CHECK-64-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 4 +// CHECK-64-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK-64-NEXT: store i32 [[ADD]], i32* [[CONV]], align 4 +// CHECK-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B1]], i64 0, i64 2 +// CHECK-64-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK-64-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK-64-NEXT: store i32 [[ADD2]], i32* [[ARRAYIDX]], align 4 +// CHECK-64-NEXT: ret void +// CHECK-64-LABEL: define 
{{[^@]+}}@.omp_offloading.requires_reg +// CHECK-64-SAME: () #[[ATTR5:[0-9]+]] { +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: call void @__tgt_register_requires(i64 1) +// CHECK-64-NEXT: ret void +// CHECK-32-LABEL: define {{[^@]+}}@_Z3fooiPd +// CHECK-32-SAME: (i32 noundef [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 4 +// CHECK-32-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[AA:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[B:%.*]] = alloca [10 x float], align 4 +// CHECK-32-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[C:%.*]] = alloca [5 x [10 x double]], align 8 +// CHECK-32-NEXT: [[__VLA_EXPR1:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[D:%.*]] = alloca [[STRUCT_TT:%.*]], align 4 +// CHECK-32-NEXT: [[E:%.*]] = alloca [[STRUCT_TT_0:%.*]], align 4 +// CHECK-32-NEXT: [[P:%.*]] = alloca i32*, align 64 +// CHECK-32-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[GA_CASTED:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x i8*], align 4 +// CHECK-32-NEXT: [[AA_CASTED:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[DOTOFFLOAD_BASEPTRS2:%.*]] = alloca [9 x i8*], align 4 +// CHECK-32-NEXT: [[DOTOFFLOAD_PTRS3:%.*]] = alloca [9 x i8*], align 4 +// CHECK-32-NEXT: [[DOTOFFLOAD_MAPPERS4:%.*]] = alloca [9 x i8*], align 4 +// CHECK-32-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [9 x i64], align 4 +// CHECK-32-NEXT: [[DOTOFFLOAD_BASEPTRS8:%.*]] = alloca [2 x i8*], align 4 +// CHECK-32-NEXT: [[DOTOFFLOAD_PTRS9:%.*]] = alloca [2 x i8*], align 4 +// CHECK-32-NEXT: [[DOTOFFLOAD_MAPPERS10:%.*]] = alloca [2 x i8*], align 4 +// CHECK-32-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK-32-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 4 +// CHECK-32-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK-32-NEXT: store i16 0, i16* [[AA]], align 2 +// CHECK-32-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP1:%.*]] = call i8* @llvm.stacksave() +// CHECK-32-NEXT: store i8* [[TMP1]], i8** [[SAVED_STACK]], align 4 +// CHECK-32-NEXT: [[VLA:%.*]] = alloca float, i32 [[TMP0]], align 4 +// CHECK-32-NEXT: store i32 [[TMP0]], i32* [[__VLA_EXPR0]], align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP3:%.*]] = mul nuw i32 5, [[TMP2]] +// CHECK-32-NEXT: [[VLA1:%.*]] = alloca double, i32 [[TMP3]], align 8 +// CHECK-32-NEXT: store i32 [[TMP2]], i32* [[__VLA_EXPR1]], align 4 +// CHECK-32-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP4:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK-32-NEXT: store i32 [[TMP4]], i32* [[X]], align 4 +// CHECK-32-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK-32-NEXT: store i32 [[TMP5]], i32* [[Y]], align 4 +// CHECK-32-NEXT: store i32* [[A]], i32** [[P]], align 64 +// CHECK-32-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4 +// CHECK-32-NEXT: store i32 [[TMP6]], i32* [[A_CASTED]], align 4 +// CHECK-32-NEXT: [[TMP7:%.*]] = load i32, i32* [[A_CASTED]], 
align 4 +// CHECK-32-NEXT: [[TMP8:%.*]] = load i32*, i32** [[P]], align 64 +// CHECK-32-NEXT: [[TMP9:%.*]] = load i32, i32* @ga, align 4 +// CHECK-32-NEXT: store i32 [[TMP9]], i32* [[GA_CASTED]], align 4 +// CHECK-32-NEXT: [[TMP10:%.*]] = load i32, i32* [[GA_CASTED]], align 4 +// CHECK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP12:%.*]] = bitcast i8** [[TMP11]] to i32* +// CHECK-32-NEXT: store i32 [[TMP7]], i32* [[TMP12]], align 4 +// CHECK-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP14:%.*]] = bitcast i8** [[TMP13]] to i32* +// CHECK-32-NEXT: store i32 [[TMP7]], i32* [[TMP14]], align 4 +// CHECK-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CHECK-32-NEXT: store i8* null, i8** [[TMP15]], align 4 +// CHECK-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP17:%.*]] = bitcast i8** [[TMP16]] to i32** +// CHECK-32-NEXT: store i32* [[TMP8]], i32** [[TMP17]], align 4 +// CHECK-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP19:%.*]] = bitcast i8** [[TMP18]] to i32** +// CHECK-32-NEXT: store i32* [[TMP8]], i32** [[TMP19]], align 4 +// CHECK-32-NEXT: [[TMP20:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1 +// CHECK-32-NEXT: store i8* null, i8** [[TMP20]], align 4 +// CHECK-32-NEXT: [[TMP21:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK-32-NEXT: [[TMP22:%.*]] = bitcast i8** [[TMP21]] to i32* +// CHECK-32-NEXT: store i32 [[TMP10]], i32* [[TMP22]], align 4 +// CHECK-32-NEXT: [[TMP23:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK-32-NEXT: [[TMP24:%.*]] = bitcast i8** [[TMP23]] to i32* +// CHECK-32-NEXT: store i32 [[TMP10]], i32* [[TMP24]], align 4 +// CHECK-32-NEXT: [[TMP25:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2 +// CHECK-32-NEXT: store i8* null, i8** [[TMP25]], align 4 +// CHECK-32-NEXT: [[TMP26:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP27:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK-32-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK-32-NEXT: store i32 2, i32* [[TMP28]], align 4 +// CHECK-32-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK-32-NEXT: store i32 3, i32* [[TMP29]], align 4 +// CHECK-32-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK-32-NEXT: store i8** [[TMP26]], i8*** [[TMP30]], align 4 +// CHECK-32-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK-32-NEXT: store i8** [[TMP27]], i8*** [[TMP31]], align 4 +// CHECK-32-NEXT: [[TMP32:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK-32-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_sizes, i32 0, i32 0), i64** [[TMP32]], align 4 +// CHECK-32-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK-32-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_maptypes, i32 0, i32 0), i64** [[TMP33]], align 4 +// CHECK-32-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK-32-NEXT: store i8** null, i8*** [[TMP34]], align 4 +// CHECK-32-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK-32-NEXT: store i8** null, i8*** [[TMP35]], align 4 +// CHECK-32-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK-32-NEXT: store i64 0, i64* [[TMP36]], align 8 +// CHECK-32-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK-32-NEXT: store i64 0, i64* [[TMP37]], align 8 +// CHECK-32-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP38]], align 4 +// CHECK-32-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK-32-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP39]], align 4 +// CHECK-32-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK-32-NEXT: store i32 0, i32* [[TMP40]], align 4 +// CHECK-32-NEXT: [[TMP41:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK-32-NEXT: [[TMP42:%.*]] = icmp ne i32 [[TMP41]], 0 +// CHECK-32-NEXT: br i1 [[TMP42]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK-32: omp_offload.failed: +// CHECK-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63(i32 [[TMP7]], i32* [[TMP8]], i32 [[TMP10]]) #[[ATTR3:[0-9]+]] +// CHECK-32-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK-32: omp_offload.cont: +// CHECK-32-NEXT: [[TMP43:%.*]] = load i16, i16* [[AA]], align 2 +// CHECK-32-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_CASTED]] to i16* +// CHECK-32-NEXT: store i16 [[TMP43]], i16* [[CONV]], align 2 +// CHECK-32-NEXT: [[TMP44:%.*]] = load i32, i32* [[AA_CASTED]], align 4 +// CHECK-32-NEXT: [[TMP45:%.*]] = mul nuw i32 [[TMP0]], 4 +// CHECK-32-NEXT: [[TMP46:%.*]] = sext i32 [[TMP45]] to i64 +// CHECK-32-NEXT: [[TMP47:%.*]] = mul nuw i32 5, [[TMP2]] +// CHECK-32-NEXT: [[TMP48:%.*]] = mul nuw i32 [[TMP47]], 8 +// CHECK-32-NEXT: [[TMP49:%.*]] = sext i32 [[TMP48]] to i64 +// CHECK-32-NEXT: [[TMP50:%.*]] = bitcast [9 x i64]* [[DOTOFFLOAD_SIZES]] to i8* +// CHECK-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP50]], i8* align 4 bitcast ([9 x i64]* @.offload_sizes.1 to i8*), i32 72, i1 false) +// 
CHECK-32-NEXT: [[TMP51:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP52:%.*]] = bitcast i8** [[TMP51]] to i32* +// CHECK-32-NEXT: store i32 [[TMP44]], i32* [[TMP52]], align 4 +// CHECK-32-NEXT: [[TMP53:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP54:%.*]] = bitcast i8** [[TMP53]] to i32* +// CHECK-32-NEXT: store i32 [[TMP44]], i32* [[TMP54]], align 4 +// CHECK-32-NEXT: [[TMP55:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 0 +// CHECK-32-NEXT: store i8* null, i8** [[TMP55]], align 4 +// CHECK-32-NEXT: [[TMP56:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP57:%.*]] = bitcast i8** [[TMP56]] to [10 x float]** +// CHECK-32-NEXT: store [10 x float]* [[B]], [10 x float]** [[TMP57]], align 4 +// CHECK-32-NEXT: [[TMP58:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP59:%.*]] = bitcast i8** [[TMP58]] to [10 x float]** +// CHECK-32-NEXT: store [10 x float]* [[B]], [10 x float]** [[TMP59]], align 4 +// CHECK-32-NEXT: [[TMP60:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 1 +// CHECK-32-NEXT: store i8* null, i8** [[TMP60]], align 4 +// CHECK-32-NEXT: [[TMP61:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 2 +// CHECK-32-NEXT: [[TMP62:%.*]] = bitcast i8** [[TMP61]] to i32* +// CHECK-32-NEXT: store i32 [[TMP0]], i32* [[TMP62]], align 4 +// CHECK-32-NEXT: [[TMP63:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 2 +// CHECK-32-NEXT: [[TMP64:%.*]] = bitcast i8** [[TMP63]] to i32* +// CHECK-32-NEXT: store i32 [[TMP0]], i32* [[TMP64]], align 4 +// CHECK-32-NEXT: [[TMP65:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 2 +// CHECK-32-NEXT: store i8* null, i8** [[TMP65]], align 4 +// CHECK-32-NEXT: [[TMP66:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 3 +// CHECK-32-NEXT: [[TMP67:%.*]] = bitcast i8** [[TMP66]] to float** +// CHECK-32-NEXT: store float* [[VLA]], float** [[TMP67]], align 4 +// CHECK-32-NEXT: [[TMP68:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 3 +// CHECK-32-NEXT: [[TMP69:%.*]] = bitcast i8** [[TMP68]] to float** +// CHECK-32-NEXT: store float* [[VLA]], float** [[TMP69]], align 4 +// CHECK-32-NEXT: [[TMP70:%.*]] = getelementptr inbounds [9 x i64], [9 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 3 +// CHECK-32-NEXT: store i64 [[TMP46]], i64* [[TMP70]], align 4 +// CHECK-32-NEXT: [[TMP71:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 3 +// CHECK-32-NEXT: store i8* null, i8** [[TMP71]], align 4 +// CHECK-32-NEXT: [[TMP72:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 4 +// CHECK-32-NEXT: [[TMP73:%.*]] = bitcast i8** [[TMP72]] to [5 x [10 x double]]** +// CHECK-32-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[TMP73]], align 4 +// CHECK-32-NEXT: [[TMP74:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 4 +// CHECK-32-NEXT: [[TMP75:%.*]] = bitcast i8** [[TMP74]] to [5 x [10 x double]]** +// CHECK-32-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[TMP75]], align 4 +// CHECK-32-NEXT: [[TMP76:%.*]] = 
getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 4 +// CHECK-32-NEXT: store i8* null, i8** [[TMP76]], align 4 +// CHECK-32-NEXT: [[TMP77:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 5 +// CHECK-32-NEXT: [[TMP78:%.*]] = bitcast i8** [[TMP77]] to i32* +// CHECK-32-NEXT: store i32 5, i32* [[TMP78]], align 4 +// CHECK-32-NEXT: [[TMP79:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 5 +// CHECK-32-NEXT: [[TMP80:%.*]] = bitcast i8** [[TMP79]] to i32* +// CHECK-32-NEXT: store i32 5, i32* [[TMP80]], align 4 +// CHECK-32-NEXT: [[TMP81:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 5 +// CHECK-32-NEXT: store i8* null, i8** [[TMP81]], align 4 +// CHECK-32-NEXT: [[TMP82:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 6 +// CHECK-32-NEXT: [[TMP83:%.*]] = bitcast i8** [[TMP82]] to i32* +// CHECK-32-NEXT: store i32 [[TMP2]], i32* [[TMP83]], align 4 +// CHECK-32-NEXT: [[TMP84:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 6 +// CHECK-32-NEXT: [[TMP85:%.*]] = bitcast i8** [[TMP84]] to i32* +// CHECK-32-NEXT: store i32 [[TMP2]], i32* [[TMP85]], align 4 +// CHECK-32-NEXT: [[TMP86:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 6 +// CHECK-32-NEXT: store i8* null, i8** [[TMP86]], align 4 +// CHECK-32-NEXT: [[TMP87:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 7 +// CHECK-32-NEXT: [[TMP88:%.*]] = bitcast i8** [[TMP87]] to double** +// CHECK-32-NEXT: store double* [[VLA1]], double** [[TMP88]], align 4 +// CHECK-32-NEXT: [[TMP89:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 7 +// CHECK-32-NEXT: [[TMP90:%.*]] = bitcast i8** [[TMP89]] to double** +// CHECK-32-NEXT: store double* [[VLA1]], double** [[TMP90]], align 4 +// CHECK-32-NEXT: [[TMP91:%.*]] = getelementptr inbounds [9 x i64], [9 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 7 +// CHECK-32-NEXT: store i64 [[TMP49]], i64* [[TMP91]], align 4 +// CHECK-32-NEXT: [[TMP92:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 7 +// CHECK-32-NEXT: store i8* null, i8** [[TMP92]], align 4 +// CHECK-32-NEXT: [[TMP93:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 8 +// CHECK-32-NEXT: [[TMP94:%.*]] = bitcast i8** [[TMP93]] to %struct.TT** +// CHECK-32-NEXT: store %struct.TT* [[D]], %struct.TT** [[TMP94]], align 4 +// CHECK-32-NEXT: [[TMP95:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 8 +// CHECK-32-NEXT: [[TMP96:%.*]] = bitcast i8** [[TMP95]] to %struct.TT** +// CHECK-32-NEXT: store %struct.TT* [[D]], %struct.TT** [[TMP96]], align 4 +// CHECK-32-NEXT: [[TMP97:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 8 +// CHECK-32-NEXT: store i8* null, i8** [[TMP97]], align 4 +// CHECK-32-NEXT: [[TMP98:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP99:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP100:%.*]] = getelementptr inbounds [9 x i64], [9 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +// CHECK-32-NEXT: [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK-32-NEXT: [[TMP101:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 0 +// CHECK-32-NEXT: store i32 2, i32* [[TMP101]], align 4 +// CHECK-32-NEXT: [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 1 +// CHECK-32-NEXT: store i32 9, i32* [[TMP102]], align 4 +// CHECK-32-NEXT: [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 2 +// CHECK-32-NEXT: store i8** [[TMP98]], i8*** [[TMP103]], align 4 +// CHECK-32-NEXT: [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 3 +// CHECK-32-NEXT: store i8** [[TMP99]], i8*** [[TMP104]], align 4 +// CHECK-32-NEXT: [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 4 +// CHECK-32-NEXT: store i64* [[TMP100]], i64** [[TMP105]], align 4 +// CHECK-32-NEXT: [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 5 +// CHECK-32-NEXT: store i64* getelementptr inbounds ([9 x i64], [9 x i64]* @.offload_maptypes.2, i32 0, i32 0), i64** [[TMP106]], align 4 +// CHECK-32-NEXT: [[TMP107:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 6 +// CHECK-32-NEXT: store i8** null, i8*** [[TMP107]], align 4 +// CHECK-32-NEXT: [[TMP108:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 7 +// CHECK-32-NEXT: store i8** null, i8*** [[TMP108]], align 4 +// CHECK-32-NEXT: [[TMP109:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 8 +// CHECK-32-NEXT: store i64 0, i64* [[TMP109]], align 8 +// CHECK-32-NEXT: [[TMP110:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 9 +// CHECK-32-NEXT: store i64 0, i64* [[TMP110]], align 8 +// CHECK-32-NEXT: [[TMP111:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 10 +// CHECK-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP111]], align 4 +// CHECK-32-NEXT: [[TMP112:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 11 +// CHECK-32-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP112]], align 4 +// CHECK-32-NEXT: [[TMP113:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 12 +// CHECK-32-NEXT: store i32 0, i32* [[TMP113]], align 4 +// CHECK-32-NEXT: [[TMP114:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]]) +// CHECK-32-NEXT: [[TMP115:%.*]] = icmp ne i32 [[TMP114]], 0 +// CHECK-32-NEXT: br i1 [[TMP115]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] +// CHECK-32: omp_offload.failed6: +// CHECK-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70(i32 [[TMP44]], [10 x float]* [[B]], i32 [[TMP0]], float* [[VLA]], [5 x [10 x double]]* [[C]], i32 5, i32 [[TMP2]], 
double* [[VLA1]], %struct.TT* [[D]]) #[[ATTR3]] +// CHECK-32-NEXT: br label [[OMP_OFFLOAD_CONT7]] +// CHECK-32: omp_offload.cont7: +// CHECK-32-NEXT: [[TMP116:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP117:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP118:%.*]] = bitcast i8** [[TMP117]] to double** +// CHECK-32-NEXT: store double* [[TMP116]], double** [[TMP118]], align 4 +// CHECK-32-NEXT: [[TMP119:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS9]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP120:%.*]] = bitcast i8** [[TMP119]] to double** +// CHECK-32-NEXT: store double* [[TMP116]], double** [[TMP120]], align 4 +// CHECK-32-NEXT: [[TMP121:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS10]], i32 0, i32 0 +// CHECK-32-NEXT: store i8* null, i8** [[TMP121]], align 4 +// CHECK-32-NEXT: [[TMP122:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP123:%.*]] = bitcast i8** [[TMP122]] to %struct.TT.0** +// CHECK-32-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[TMP123]], align 4 +// CHECK-32-NEXT: [[TMP124:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS9]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP125:%.*]] = bitcast i8** [[TMP124]] to %struct.TT.0** +// CHECK-32-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[TMP125]], align 4 +// CHECK-32-NEXT: [[TMP126:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS10]], i32 0, i32 1 +// CHECK-32-NEXT: store i8* null, i8** [[TMP126]], align 4 +// CHECK-32-NEXT: [[TMP127:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP128:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS9]], i32 0, i32 0 +// CHECK-32-NEXT: [[KERNEL_ARGS11:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK-32-NEXT: [[TMP129:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 0 +// CHECK-32-NEXT: store i32 2, i32* [[TMP129]], align 4 +// CHECK-32-NEXT: [[TMP130:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 1 +// CHECK-32-NEXT: store i32 2, i32* [[TMP130]], align 4 +// CHECK-32-NEXT: [[TMP131:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 2 +// CHECK-32-NEXT: store i8** [[TMP127]], i8*** [[TMP131]], align 4 +// CHECK-32-NEXT: [[TMP132:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 3 +// CHECK-32-NEXT: store i8** [[TMP128]], i8*** [[TMP132]], align 4 +// CHECK-32-NEXT: [[TMP133:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 4 +// CHECK-32-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.3, i32 0, i32 0), i64** [[TMP133]], align 4 +// CHECK-32-NEXT: [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 5 +// CHECK-32-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i32 0, i32 0), i64** [[TMP134]], align 4 +// CHECK-32-NEXT: [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], 
%struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 6 +// CHECK-32-NEXT: store i8** null, i8*** [[TMP135]], align 4 +// CHECK-32-NEXT: [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 7 +// CHECK-32-NEXT: store i8** null, i8*** [[TMP136]], align 4 +// CHECK-32-NEXT: [[TMP137:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 8 +// CHECK-32-NEXT: store i64 0, i64* [[TMP137]], align 8 +// CHECK-32-NEXT: [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 9 +// CHECK-32-NEXT: store i64 0, i64* [[TMP138]], align 8 +// CHECK-32-NEXT: [[TMP139:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 10 +// CHECK-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP139]], align 4 +// CHECK-32-NEXT: [[TMP140:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 11 +// CHECK-32-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP140]], align 4 +// CHECK-32-NEXT: [[TMP141:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 12 +// CHECK-32-NEXT: store i32 0, i32* [[TMP141]], align 4 +// CHECK-32-NEXT: [[TMP142:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]]) +// CHECK-32-NEXT: [[TMP143:%.*]] = icmp ne i32 [[TMP142]], 0 +// CHECK-32-NEXT: br i1 [[TMP143]], label [[OMP_OFFLOAD_FAILED12:%.*]], label [[OMP_OFFLOAD_CONT13:%.*]] +// CHECK-32: omp_offload.failed12: +// CHECK-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111(double* [[TMP116]], %struct.TT.0* [[E]]) #[[ATTR3]] +// CHECK-32-NEXT: br label [[OMP_OFFLOAD_CONT13]] +// CHECK-32: omp_offload.cont13: +// CHECK-32-NEXT: [[TMP144:%.*]] = load i32, i32* [[A]], align 4 +// CHECK-32-NEXT: [[TMP145:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// CHECK-32-NEXT: call void @llvm.stackrestore(i8* [[TMP145]]) +// CHECK-32-NEXT: ret i32 [[TMP144]] +// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63 +// CHECK-32-SAME: (i32 noundef [[A:%.*]], i32* noundef [[P:%.*]], i32 noundef [[GA:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[P_ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[GA_ADDR:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK-32-NEXT: store i32* [[P]], i32** [[P_ADDR]], align 4 +// CHECK-32-NEXT: store i32 [[GA]], i32* [[GA_ADDR]], align 4 +// CHECK-32-NEXT: ret void +// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70 +// CHECK-32-SAME: (i32 noundef [[AA:%.*]], [10 x float]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i32 noundef [[VLA:%.*]], float* noundef nonnull align 4 dereferenceable(4) [[BN:%.*]], [5 x [10 x double]]* noundef nonnull align 4 dereferenceable(400) [[C:%.*]], i32 noundef [[VLA1:%.*]], i32 noundef [[VLA3:%.*]], double* noundef nonnull align 4 dereferenceable(8) [[CN:%.*]], %struct.TT* noundef nonnull align 4 dereferenceable(12) 
[[D:%.*]]) #[[ATTR2]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[B_ADDR:%.*]] = alloca [10 x float]*, align 4 +// CHECK-32-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[BN_ADDR:%.*]] = alloca float*, align 4 +// CHECK-32-NEXT: [[C_ADDR:%.*]] = alloca [5 x [10 x double]]*, align 4 +// CHECK-32-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[VLA_ADDR4:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[CN_ADDR:%.*]] = alloca double*, align 4 +// CHECK-32-NEXT: [[D_ADDR:%.*]] = alloca %struct.TT*, align 4 +// CHECK-32-NEXT: [[B5:%.*]] = alloca [10 x float], align 4 +// CHECK-32-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[C7:%.*]] = alloca [5 x [10 x double]], align 8 +// CHECK-32-NEXT: [[__VLA_EXPR1:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[__VLA_EXPR2:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[D9:%.*]] = alloca [[STRUCT_TT:%.*]], align 4 +// CHECK-32-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 +// CHECK-32-NEXT: store [10 x float]* [[B]], [10 x float]** [[B_ADDR]], align 4 +// CHECK-32-NEXT: store i32 [[VLA]], i32* [[VLA_ADDR]], align 4 +// CHECK-32-NEXT: store float* [[BN]], float** [[BN_ADDR]], align 4 +// CHECK-32-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[C_ADDR]], align 4 +// CHECK-32-NEXT: store i32 [[VLA1]], i32* [[VLA_ADDR2]], align 4 +// CHECK-32-NEXT: store i32 [[VLA3]], i32* [[VLA_ADDR4]], align 4 +// CHECK-32-NEXT: store double* [[CN]], double** [[CN_ADDR]], align 4 +// CHECK-32-NEXT: store %struct.TT* [[D]], %struct.TT** [[D_ADDR]], align 4 +// CHECK-32-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* +// CHECK-32-NEXT: [[TMP0:%.*]] = load [10 x float]*, [10 x float]** [[B_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = load float*, float** [[BN_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP3:%.*]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP4:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4 +// CHECK-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[VLA_ADDR4]], align 4 +// CHECK-32-NEXT: [[TMP6:%.*]] = load double*, double** [[CN_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP7:%.*]] = load %struct.TT*, %struct.TT** [[D_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP8:%.*]] = bitcast [10 x float]* [[B5]] to i8* +// CHECK-32-NEXT: [[TMP9:%.*]] = bitcast [10 x float]* [[TMP0]] to i8* +// CHECK-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i32 40, i1 false) +// CHECK-32-NEXT: [[TMP10:%.*]] = call i8* @llvm.stacksave() +// CHECK-32-NEXT: store i8* [[TMP10]], i8** [[SAVED_STACK]], align 4 +// CHECK-32-NEXT: [[VLA6:%.*]] = alloca float, i32 [[TMP1]], align 4 +// CHECK-32-NEXT: store i32 [[TMP1]], i32* [[__VLA_EXPR0]], align 4 +// CHECK-32-NEXT: [[TMP11:%.*]] = mul nuw i32 [[TMP1]], 4 +// CHECK-32-NEXT: [[TMP12:%.*]] = bitcast float* [[VLA6]] to i8* +// CHECK-32-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP2]] to i8* +// CHECK-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP12]], i8* align 4 [[TMP13]], i32 [[TMP11]], i1 false) +// CHECK-32-NEXT: [[TMP14:%.*]] = bitcast [5 x [10 x double]]* [[C7]] to i8* +// CHECK-32-NEXT: [[TMP15:%.*]] = bitcast [5 x [10 x double]]* [[TMP3]] to i8* +// CHECK-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i32 400, i1 false) +// 
CHECK-32-NEXT: [[TMP16:%.*]] = mul nuw i32 [[TMP4]], [[TMP5]] +// CHECK-32-NEXT: [[VLA8:%.*]] = alloca double, i32 [[TMP16]], align 8 +// CHECK-32-NEXT: store i32 [[TMP4]], i32* [[__VLA_EXPR1]], align 4 +// CHECK-32-NEXT: store i32 [[TMP5]], i32* [[__VLA_EXPR2]], align 4 +// CHECK-32-NEXT: [[TMP17:%.*]] = mul nuw i32 [[TMP4]], [[TMP5]] +// CHECK-32-NEXT: [[TMP18:%.*]] = mul nuw i32 [[TMP17]], 8 +// CHECK-32-NEXT: [[TMP19:%.*]] = bitcast double* [[VLA8]] to i8* +// CHECK-32-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP6]] to i8* +// CHECK-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP19]], i8* align 8 [[TMP20]], i32 [[TMP18]], i1 false) +// CHECK-32-NEXT: [[TMP21:%.*]] = bitcast %struct.TT* [[D9]] to i8* +// CHECK-32-NEXT: [[TMP22:%.*]] = bitcast %struct.TT* [[TMP7]] to i8* +// CHECK-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP21]], i8* align 4 [[TMP22]], i32 12, i1 false) +// CHECK-32-NEXT: [[TMP23:%.*]] = load i16, i16* [[CONV]], align 2 +// CHECK-32-NEXT: [[CONV10:%.*]] = sext i16 [[TMP23]] to i32 +// CHECK-32-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV10]], 1 +// CHECK-32-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD]] to i16 +// CHECK-32-NEXT: store i16 [[CONV11]], i16* [[CONV]], align 2 +// CHECK-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[B5]], i32 0, i32 2 +// CHECK-32-NEXT: store float 1.000000e+00, float* [[ARRAYIDX]], align 4 +// CHECK-32-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[VLA6]], i32 3 +// CHECK-32-NEXT: store float 1.000000e+00, float* [[ARRAYIDX12]], align 4 +// CHECK-32-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[C7]], i32 0, i32 1 +// CHECK-32-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX13]], i32 0, i32 2 +// CHECK-32-NEXT: store double 1.000000e+00, double* [[ARRAYIDX14]], align 8 +// CHECK-32-NEXT: [[TMP24:%.*]] = mul nsw i32 1, [[TMP5]] +// CHECK-32-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds double, double* [[VLA8]], i32 [[TMP24]] +// CHECK-32-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX15]], i32 3 +// CHECK-32-NEXT: store double 1.000000e+00, double* [[ARRAYIDX16]], align 8 +// CHECK-32-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 0 +// CHECK-32-NEXT: store i64 1, i64* [[X]], align 4 +// CHECK-32-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 1 +// CHECK-32-NEXT: store i8 1, i8* [[Y]], align 4 +// CHECK-32-NEXT: [[TMP25:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// CHECK-32-NEXT: call void @llvm.stackrestore(i8* [[TMP25]]) +// CHECK-32-NEXT: ret void +// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111 +// CHECK-32-SAME: (double* noundef [[PTR:%.*]], %struct.TT.0* noundef nonnull align 4 dereferenceable(8) [[E:%.*]]) #[[ATTR2]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 4 +// CHECK-32-NEXT: [[E_ADDR:%.*]] = alloca %struct.TT.0*, align 4 +// CHECK-32-NEXT: [[E1:%.*]] = alloca [[STRUCT_TT_0:%.*]], align 4 +// CHECK-32-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 4 +// CHECK-32-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[E_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP0:%.*]] = load %struct.TT.0*, %struct.TT.0** [[E_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP1:%.*]] = bitcast %struct.TT.0* [[E1]] to i8* +// CHECK-32-NEXT: [[TMP2:%.*]] = bitcast %struct.TT.0* [[TMP0]] to i8* 
+// CHECK-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i32 8, i1 false) +// CHECK-32-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E1]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[X]], align 4 +// CHECK-32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP3]] to double +// CHECK-32-NEXT: [[TMP4:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// CHECK-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[TMP4]], i32 0 +// CHECK-32-NEXT: store double [[CONV]], double* [[ARRAYIDX]], align 4 +// CHECK-32-NEXT: [[TMP5:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// CHECK-32-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[TMP5]], i32 0 +// CHECK-32-NEXT: [[TMP6:%.*]] = load double, double* [[ARRAYIDX2]], align 4 +// CHECK-32-NEXT: [[INC:%.*]] = fadd double [[TMP6]], 1.000000e+00 +// CHECK-32-NEXT: store double [[INC]], double* [[ARRAYIDX2]], align 4 +// CHECK-32-NEXT: ret void +// CHECK-32-LABEL: define {{[^@]+}}@_Z3bariPd +// CHECK-32-SAME: (i32 noundef [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 4 +// CHECK-32-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[S:%.*]] = alloca [[STRUCT_S1:%.*]], align 4 +// CHECK-32-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK-32-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 4 +// CHECK-32-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK-32-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP1:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// CHECK-32-NEXT: [[CALL:%.*]] = call noundef i32 @_Z3fooiPd(i32 noundef [[TMP0]], double* noundef [[TMP1]]) +// CHECK-32-NEXT: [[TMP2:%.*]] = load i32, i32* [[A]], align 4 +// CHECK-32-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[CALL]] +// CHECK-32-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// CHECK-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK-32-NEXT: [[CALL1:%.*]] = call noundef i32 @_ZN2S12r1Ei(%struct.S1* noundef nonnull align 4 dereferenceable(8) [[S]], i32 noundef [[TMP3]]) +// CHECK-32-NEXT: [[TMP4:%.*]] = load i32, i32* [[A]], align 4 +// CHECK-32-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], [[CALL1]] +// CHECK-32-NEXT: store i32 [[ADD2]], i32* [[A]], align 4 +// CHECK-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK-32-NEXT: [[CALL3:%.*]] = call noundef i32 @_ZL7fstatici(i32 noundef [[TMP5]]) +// CHECK-32-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4 +// CHECK-32-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP6]], [[CALL3]] +// CHECK-32-NEXT: store i32 [[ADD4]], i32* [[A]], align 4 +// CHECK-32-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK-32-NEXT: [[CALL5:%.*]] = call noundef i32 @_Z9ftemplateIiET_i(i32 noundef [[TMP7]]) +// CHECK-32-NEXT: [[TMP8:%.*]] = load i32, i32* [[A]], align 4 +// CHECK-32-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP8]], [[CALL5]] +// CHECK-32-NEXT: store i32 [[ADD6]], i32* [[A]], align 4 +// CHECK-32-NEXT: [[TMP9:%.*]] = load i32, i32* [[A]], align 4 +// CHECK-32-NEXT: ret i32 [[TMP9]] +// CHECK-32-LABEL: define {{[^@]+}}@_ZN2S12r1Ei +// CHECK-32-SAME: (%struct.S1* noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0]] comdat align 2 { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 4 +// CHECK-32-NEXT: 
[[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[B:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[B_CASTED:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [5 x i8*], align 4 +// CHECK-32-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [5 x i64], align 4 +// CHECK-32-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 4 +// CHECK-32-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK-32-NEXT: [[THIS1:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK-32-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// CHECK-32-NEXT: store i32 [[ADD]], i32* [[B]], align 4 +// CHECK-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = call i8* @llvm.stacksave() +// CHECK-32-NEXT: store i8* [[TMP2]], i8** [[SAVED_STACK]], align 4 +// CHECK-32-NEXT: [[TMP3:%.*]] = mul nuw i32 2, [[TMP1]] +// CHECK-32-NEXT: [[VLA:%.*]] = alloca i16, i32 [[TMP3]], align 2 +// CHECK-32-NEXT: store i32 [[TMP1]], i32* [[__VLA_EXPR0]], align 4 +// CHECK-32-NEXT: [[TMP4:%.*]] = load i32, i32* [[B]], align 4 +// CHECK-32-NEXT: store i32 [[TMP4]], i32* [[B_CASTED]], align 4 +// CHECK-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[B_CASTED]], align 4 +// CHECK-32-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[THIS1]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP6:%.*]] = mul nuw i32 2, [[TMP1]] +// CHECK-32-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 2 +// CHECK-32-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64 +// CHECK-32-NEXT: [[TMP9:%.*]] = bitcast [5 x i64]* [[DOTOFFLOAD_SIZES]] to i8* +// CHECK-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP9]], i8* align 4 bitcast ([5 x i64]* @.offload_sizes.5 to i8*), i32 40, i1 false) +// CHECK-32-NEXT: [[TMP10:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP11:%.*]] = bitcast i8** [[TMP10]] to %struct.S1** +// CHECK-32-NEXT: store %struct.S1* [[THIS1]], %struct.S1** [[TMP11]], align 4 +// CHECK-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP13:%.*]] = bitcast i8** [[TMP12]] to double** +// CHECK-32-NEXT: store double* [[A]], double** [[TMP13]], align 4 +// CHECK-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CHECK-32-NEXT: store i8* null, i8** [[TMP14]], align 4 +// CHECK-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP16:%.*]] = bitcast i8** [[TMP15]] to i32* +// CHECK-32-NEXT: store i32 [[TMP5]], i32* [[TMP16]], align 4 +// CHECK-32-NEXT: [[TMP17:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP18:%.*]] = bitcast i8** [[TMP17]] to i32* +// CHECK-32-NEXT: store i32 [[TMP5]], i32* [[TMP18]], align 4 +// CHECK-32-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1 +// CHECK-32-NEXT: store i8* null, i8** [[TMP19]], align 4 +// CHECK-32-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* 
[[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK-32-NEXT: [[TMP21:%.*]] = bitcast i8** [[TMP20]] to i32* +// CHECK-32-NEXT: store i32 2, i32* [[TMP21]], align 4 +// CHECK-32-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK-32-NEXT: [[TMP23:%.*]] = bitcast i8** [[TMP22]] to i32* +// CHECK-32-NEXT: store i32 2, i32* [[TMP23]], align 4 +// CHECK-32-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2 +// CHECK-32-NEXT: store i8* null, i8** [[TMP24]], align 4 +// CHECK-32-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 3 +// CHECK-32-NEXT: [[TMP26:%.*]] = bitcast i8** [[TMP25]] to i32* +// CHECK-32-NEXT: store i32 [[TMP1]], i32* [[TMP26]], align 4 +// CHECK-32-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 3 +// CHECK-32-NEXT: [[TMP28:%.*]] = bitcast i8** [[TMP27]] to i32* +// CHECK-32-NEXT: store i32 [[TMP1]], i32* [[TMP28]], align 4 +// CHECK-32-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 3 +// CHECK-32-NEXT: store i8* null, i8** [[TMP29]], align 4 +// CHECK-32-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 4 +// CHECK-32-NEXT: [[TMP31:%.*]] = bitcast i8** [[TMP30]] to i16** +// CHECK-32-NEXT: store i16* [[VLA]], i16** [[TMP31]], align 4 +// CHECK-32-NEXT: [[TMP32:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 4 +// CHECK-32-NEXT: [[TMP33:%.*]] = bitcast i8** [[TMP32]] to i16** +// CHECK-32-NEXT: store i16* [[VLA]], i16** [[TMP33]], align 4 +// CHECK-32-NEXT: [[TMP34:%.*]] = getelementptr inbounds [5 x i64], [5 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 4 +// CHECK-32-NEXT: store i64 [[TMP8]], i64* [[TMP34]], align 4 +// CHECK-32-NEXT: [[TMP35:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 4 +// CHECK-32-NEXT: store i8* null, i8** [[TMP35]], align 4 +// CHECK-32-NEXT: [[TMP36:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP37:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP38:%.*]] = getelementptr inbounds [5 x i64], [5 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +// CHECK-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK-32-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK-32-NEXT: store i32 2, i32* [[TMP39]], align 4 +// CHECK-32-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK-32-NEXT: store i32 5, i32* [[TMP40]], align 4 +// CHECK-32-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK-32-NEXT: store i8** [[TMP36]], i8*** [[TMP41]], align 4 +// CHECK-32-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK-32-NEXT: store i8** [[TMP37]], i8*** [[TMP42]], align 4 +// CHECK-32-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// 
CHECK-32-NEXT: store i64* [[TMP38]], i64** [[TMP43]], align 4 +// CHECK-32-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK-32-NEXT: store i64* getelementptr inbounds ([5 x i64], [5 x i64]* @.offload_maptypes.6, i32 0, i32 0), i64** [[TMP44]], align 4 +// CHECK-32-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK-32-NEXT: store i8** null, i8*** [[TMP45]], align 4 +// CHECK-32-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK-32-NEXT: store i8** null, i8*** [[TMP46]], align 4 +// CHECK-32-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK-32-NEXT: store i64 0, i64* [[TMP47]], align 8 +// CHECK-32-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK-32-NEXT: store i64 0, i64* [[TMP48]], align 8 +// CHECK-32-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP49]], align 4 +// CHECK-32-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK-32-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP50]], align 4 +// CHECK-32-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK-32-NEXT: store i32 0, i32* [[TMP51]], align 4 +// CHECK-32-NEXT: [[TMP52:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK-32-NEXT: [[TMP53:%.*]] = icmp ne i32 [[TMP52]], 0 +// CHECK-32-NEXT: br i1 [[TMP53]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK-32: omp_offload.failed: +// CHECK-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167(%struct.S1* [[THIS1]], i32 [[TMP5]], i32 2, i32 [[TMP1]], i16* [[VLA]]) #[[ATTR3]] +// CHECK-32-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK-32: omp_offload.cont: +// CHECK-32-NEXT: [[TMP54:%.*]] = mul nsw i32 1, [[TMP1]] +// CHECK-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA]], i32 [[TMP54]] +// CHECK-32-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i32 1 +// CHECK-32-NEXT: [[TMP55:%.*]] = load i16, i16* [[ARRAYIDX2]], align 2 +// CHECK-32-NEXT: [[CONV:%.*]] = sext i16 [[TMP55]] to i32 +// CHECK-32-NEXT: [[TMP56:%.*]] = load i32, i32* [[B]], align 4 +// CHECK-32-NEXT: [[ADD3:%.*]] = add nsw i32 [[CONV]], [[TMP56]] +// CHECK-32-NEXT: [[TMP57:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// CHECK-32-NEXT: call void @llvm.stackrestore(i8* [[TMP57]]) +// CHECK-32-NEXT: ret i32 [[ADD3]] +// CHECK-32-LABEL: define {{[^@]+}}@_ZL7fstatici +// CHECK-32-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[AAA:%.*]] = alloca 
i8, align 1 +// CHECK-32-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// CHECK-32-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[AAA_CASTED:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x i8*], align 4 +// CHECK-32-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK-32-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK-32-NEXT: store i8 0, i8* [[AAA]], align 1 +// CHECK-32-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// CHECK-32-NEXT: store i32 [[TMP0]], i32* [[A_CASTED]], align 4 +// CHECK-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[A_CASTED]], align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = load i8, i8* [[AAA]], align 1 +// CHECK-32-NEXT: [[CONV:%.*]] = bitcast i32* [[AAA_CASTED]] to i8* +// CHECK-32-NEXT: store i8 [[TMP2]], i8* [[CONV]], align 1 +// CHECK-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[AAA_CASTED]], align 4 +// CHECK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP5:%.*]] = bitcast i8** [[TMP4]] to i32* +// CHECK-32-NEXT: store i32 [[TMP1]], i32* [[TMP5]], align 4 +// CHECK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32* +// CHECK-32-NEXT: store i32 [[TMP1]], i32* [[TMP7]], align 4 +// CHECK-32-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CHECK-32-NEXT: store i8* null, i8** [[TMP8]], align 4 +// CHECK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP10:%.*]] = bitcast i8** [[TMP9]] to i32* +// CHECK-32-NEXT: store i32 [[TMP3]], i32* [[TMP10]], align 4 +// CHECK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP12:%.*]] = bitcast i8** [[TMP11]] to i32* +// CHECK-32-NEXT: store i32 [[TMP3]], i32* [[TMP12]], align 4 +// CHECK-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1 +// CHECK-32-NEXT: store i8* null, i8** [[TMP13]], align 4 +// CHECK-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK-32-NEXT: [[TMP15:%.*]] = bitcast i8** [[TMP14]] to [10 x i32]** +// CHECK-32-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP15]], align 4 +// CHECK-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK-32-NEXT: [[TMP17:%.*]] = bitcast i8** [[TMP16]] to [10 x i32]** +// CHECK-32-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP17]], align 4 +// CHECK-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2 +// CHECK-32-NEXT: store i8* null, i8** [[TMP18]], align 4 +// CHECK-32-NEXT: [[TMP19:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP20:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK-32-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// 
CHECK-32-NEXT: store i32 2, i32* [[TMP21]], align 4 +// CHECK-32-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK-32-NEXT: store i32 3, i32* [[TMP22]], align 4 +// CHECK-32-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK-32-NEXT: store i8** [[TMP19]], i8*** [[TMP23]], align 4 +// CHECK-32-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK-32-NEXT: store i8** [[TMP20]], i8*** [[TMP24]], align 4 +// CHECK-32-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK-32-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_sizes.7, i32 0, i32 0), i64** [[TMP25]], align 4 +// CHECK-32-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK-32-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_maptypes.8, i32 0, i32 0), i64** [[TMP26]], align 4 +// CHECK-32-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK-32-NEXT: store i8** null, i8*** [[TMP27]], align 4 +// CHECK-32-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK-32-NEXT: store i8** null, i8*** [[TMP28]], align 4 +// CHECK-32-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK-32-NEXT: store i64 0, i64* [[TMP29]], align 8 +// CHECK-32-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK-32-NEXT: store i64 0, i64* [[TMP30]], align 8 +// CHECK-32-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP31]], align 4 +// CHECK-32-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK-32-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP32]], align 4 +// CHECK-32-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK-32-NEXT: store i32 0, i32* [[TMP33]], align 4 +// CHECK-32-NEXT: [[TMP34:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK-32-NEXT: [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0 +// CHECK-32-NEXT: br i1 [[TMP35]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK-32: omp_offload.failed: +// CHECK-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142(i32 [[TMP1]], i32 [[TMP3]], [10 x i32]* [[B]]) #[[ATTR3]] +// CHECK-32-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK-32: omp_offload.cont: +// CHECK-32-NEXT: [[TMP36:%.*]] = load i32, i32* 
[[A]], align 4 +// CHECK-32-NEXT: ret i32 [[TMP36]] +// CHECK-32-LABEL: define {{[^@]+}}@_Z9ftemplateIiET_i +// CHECK-32-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0]] comdat { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// CHECK-32-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [2 x i8*], align 4 +// CHECK-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [2 x i8*], align 4 +// CHECK-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [2 x i8*], align 4 +// CHECK-32-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK-32-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK-32-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// CHECK-32-NEXT: store i32 [[TMP0]], i32* [[A_CASTED]], align 4 +// CHECK-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[A_CASTED]], align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i32* +// CHECK-32-NEXT: store i32 [[TMP1]], i32* [[TMP3]], align 4 +// CHECK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP5:%.*]] = bitcast i8** [[TMP4]] to i32* +// CHECK-32-NEXT: store i32 [[TMP1]], i32* [[TMP5]], align 4 +// CHECK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CHECK-32-NEXT: store i8* null, i8** [[TMP6]], align 4 +// CHECK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP8:%.*]] = bitcast i8** [[TMP7]] to [10 x i32]** +// CHECK-32-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP8]], align 4 +// CHECK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP10:%.*]] = bitcast i8** [[TMP9]] to [10 x i32]** +// CHECK-32-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP10]], align 4 +// CHECK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1 +// CHECK-32-NEXT: store i8* null, i8** [[TMP11]], align 4 +// CHECK-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK-32-NEXT: store i32 2, i32* [[TMP14]], align 4 +// CHECK-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK-32-NEXT: store i32 2, i32* [[TMP15]], align 4 +// CHECK-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK-32-NEXT: store i8** [[TMP12]], i8*** [[TMP16]], align 4 +// CHECK-32-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK-32-NEXT: store i8** [[TMP13]], i8*** [[TMP17]], align 4 +// CHECK-32-NEXT: [[TMP18:%.*]] = getelementptr 
inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK-32-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.9, i32 0, i32 0), i64** [[TMP18]], align 4 +// CHECK-32-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK-32-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.10, i32 0, i32 0), i64** [[TMP19]], align 4 +// CHECK-32-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK-32-NEXT: store i8** null, i8*** [[TMP20]], align 4 +// CHECK-32-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK-32-NEXT: store i8** null, i8*** [[TMP21]], align 4 +// CHECK-32-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK-32-NEXT: store i64 0, i64* [[TMP22]], align 8 +// CHECK-32-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK-32-NEXT: store i64 0, i64* [[TMP23]], align 8 +// CHECK-32-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP24]], align 4 +// CHECK-32-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK-32-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP25]], align 4 +// CHECK-32-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK-32-NEXT: store i32 0, i32* [[TMP26]], align 4 +// CHECK-32-NEXT: [[TMP27:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK-32-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0 +// CHECK-32-NEXT: br i1 [[TMP28]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK-32: omp_offload.failed: +// CHECK-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128(i32 [[TMP1]], [10 x i32]* [[B]]) #[[ATTR3]] +// CHECK-32-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK-32: omp_offload.cont: +// CHECK-32-NEXT: [[TMP29:%.*]] = load i32, i32* [[A]], align 4 +// CHECK-32-NEXT: ret i32 [[TMP29]] +// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167 +// CHECK-32-SAME: (%struct.S1* noundef [[THIS:%.*]], i32 noundef [[B:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR2]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 4 +// CHECK-32-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[C_ADDR:%.*]] = alloca i16*, align 4 +// CHECK-32-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: 
[[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[__VLA_EXPR1:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 4 +// CHECK-32-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 +// CHECK-32-NEXT: store i32 [[VLA]], i32* [[VLA_ADDR]], align 4 +// CHECK-32-NEXT: store i32 [[VLA1]], i32* [[VLA_ADDR2]], align 4 +// CHECK-32-NEXT: store i16* [[C]], i16** [[C_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP0:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4 +// CHECK-32-NEXT: [[TMP3:%.*]] = load i16*, i16** [[C_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP4:%.*]] = call i8* @llvm.stacksave() +// CHECK-32-NEXT: store i8* [[TMP4]], i8** [[SAVED_STACK]], align 4 +// CHECK-32-NEXT: [[TMP5:%.*]] = mul nuw i32 [[TMP1]], [[TMP2]] +// CHECK-32-NEXT: [[VLA3:%.*]] = alloca i16, i32 [[TMP5]], align 2 +// CHECK-32-NEXT: store i32 [[TMP1]], i32* [[__VLA_EXPR0]], align 4 +// CHECK-32-NEXT: store i32 [[TMP2]], i32* [[__VLA_EXPR1]], align 4 +// CHECK-32-NEXT: [[TMP6:%.*]] = mul nuw i32 [[TMP1]], [[TMP2]] +// CHECK-32-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 2 +// CHECK-32-NEXT: [[TMP8:%.*]] = bitcast i16* [[VLA3]] to i8* +// CHECK-32-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP3]] to i8* +// CHECK-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[TMP8]], i8* align 2 [[TMP9]], i32 [[TMP7]], i1 false) +// CHECK-32-NEXT: [[TMP10:%.*]] = load i32, i32* [[B_ADDR]], align 4 +// CHECK-32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP10]] to double +// CHECK-32-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00 +// CHECK-32-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[TMP0]], i32 0, i32 0 +// CHECK-32-NEXT: store double [[ADD]], double* [[A]], align 4 +// CHECK-32-NEXT: [[A4:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP11:%.*]] = load double, double* [[A4]], align 4 +// CHECK-32-NEXT: [[INC:%.*]] = fadd double [[TMP11]], 1.000000e+00 +// CHECK-32-NEXT: store double [[INC]], double* [[A4]], align 4 +// CHECK-32-NEXT: [[CONV5:%.*]] = fptosi double [[INC]] to i16 +// CHECK-32-NEXT: [[TMP12:%.*]] = mul nsw i32 1, [[TMP2]] +// CHECK-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA3]], i32 [[TMP12]] +// CHECK-32-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i32 1 +// CHECK-32-NEXT: store i16 [[CONV5]], i16* [[ARRAYIDX6]], align 2 +// CHECK-32-NEXT: [[TMP13:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// CHECK-32-NEXT: call void @llvm.stackrestore(i8* [[TMP13]]) +// CHECK-32-NEXT: ret void +// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142 +// CHECK-32-SAME: (i32 noundef [[A:%.*]], i32 noundef [[AAA:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR2]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[AAA_ADDR:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK-32-NEXT: [[B1:%.*]] = alloca [10 x i32], align 4 +// CHECK-32-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK-32-NEXT: store i32 [[AAA]], i32* [[AAA_ADDR]], align 4 +// CHECK-32-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK-32-NEXT: [[CONV:%.*]] = bitcast i32* [[AAA_ADDR]] to i8* +// CHECK-32-NEXT: 
[[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B1]] to i8* +// CHECK-32-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i32 40, i1 false) +// CHECK-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK-32-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK-32-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP4:%.*]] = load i8, i8* [[CONV]], align 1 +// CHECK-32-NEXT: [[CONV2:%.*]] = sext i8 [[TMP4]] to i32 +// CHECK-32-NEXT: [[ADD3:%.*]] = add nsw i32 [[CONV2]], 1 +// CHECK-32-NEXT: [[CONV4:%.*]] = trunc i32 [[ADD3]] to i8 +// CHECK-32-NEXT: store i8 [[CONV4]], i8* [[CONV]], align 1 +// CHECK-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B1]], i32 0, i32 2 +// CHECK-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK-32-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK-32-NEXT: store i32 [[ADD5]], i32* [[ARRAYIDX]], align 4 +// CHECK-32-NEXT: ret void +// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128 +// CHECK-32-SAME: (i32 noundef [[A:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR2]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK-32-NEXT: [[B1:%.*]] = alloca [10 x i32], align 4 +// CHECK-32-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK-32-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B1]] to i8* +// CHECK-32-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i32 40, i1 false) +// CHECK-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK-32-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK-32-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// CHECK-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B1]], i32 0, i32 2 +// CHECK-32-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK-32-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK-32-NEXT: store i32 [[ADD2]], i32* [[ARRAYIDX]], align 4 +// CHECK-32-NEXT: ret void +// CHECK-32-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// CHECK-32-SAME: () #[[ATTR5:[0-9]+]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: call void @__tgt_register_requires(i64 1) +// CHECK-32-NEXT: ret void +// CHECK0-64-LABEL: define {{[^@]+}}@_Z3fooiPd +// CHECK0-64-SAME: (i32 noundef signext [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK0-64-NEXT: entry: +// CHECK0-64-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK0-64-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 8 +// CHECK0-64-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK0-64-NEXT: [[AA:%.*]] = alloca i16, align 2 +// CHECK0-64-NEXT: [[B:%.*]] = alloca [10 x float], align 4 +// CHECK0-64-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// CHECK0-64-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// CHECK0-64-NEXT: [[C:%.*]] = alloca [5 x [10 x double]], align 8 +// CHECK0-64-NEXT: [[__VLA_EXPR1:%.*]] = alloca i64, align 8 +// CHECK0-64-NEXT: [[D:%.*]] = alloca 
[[STRUCT_TT:%.*]], align 8 +// CHECK0-64-NEXT: [[E:%.*]] = alloca [[STRUCT_TT_0:%.*]], align 4 +// CHECK0-64-NEXT: [[P:%.*]] = alloca i32*, align 64 +// CHECK0-64-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8 +// CHECK0-64-NEXT: [[GA_CASTED:%.*]] = alloca i64, align 8 +// CHECK0-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK0-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK0-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x i8*], align 8 +// CHECK0-64-NEXT: [[AA_CASTED:%.*]] = alloca i64, align 8 +// CHECK0-64-NEXT: [[DOTOFFLOAD_BASEPTRS4:%.*]] = alloca [9 x i8*], align 8 +// CHECK0-64-NEXT: [[DOTOFFLOAD_PTRS5:%.*]] = alloca [9 x i8*], align 8 +// CHECK0-64-NEXT: [[DOTOFFLOAD_MAPPERS6:%.*]] = alloca [9 x i8*], align 8 +// CHECK0-64-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [9 x i64], align 8 +// CHECK0-64-NEXT: [[DOTOFFLOAD_BASEPTRS10:%.*]] = alloca [2 x i8*], align 8 +// CHECK0-64-NEXT: [[DOTOFFLOAD_PTRS11:%.*]] = alloca [2 x i8*], align 8 +// CHECK0-64-NEXT: [[DOTOFFLOAD_MAPPERS12:%.*]] = alloca [2 x i8*], align 8 +// CHECK0-64-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK0-64-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 8 +// CHECK0-64-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK0-64-NEXT: store i16 0, i16* [[AA]], align 2 +// CHECK0-64-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK0-64-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +// CHECK0-64-NEXT: [[TMP2:%.*]] = call i8* @llvm.stacksave() +// CHECK0-64-NEXT: store i8* [[TMP2]], i8** [[SAVED_STACK]], align 8 +// CHECK0-64-NEXT: [[VLA:%.*]] = alloca float, i64 [[TMP1]], align 4 +// CHECK0-64-NEXT: store i64 [[TMP1]], i64* [[__VLA_EXPR0]], align 8 +// CHECK0-64-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK0-64-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +// CHECK0-64-NEXT: [[TMP5:%.*]] = mul nuw i64 5, [[TMP4]] +// CHECK0-64-NEXT: [[VLA1:%.*]] = alloca double, i64 [[TMP5]], align 8 +// CHECK0-64-NEXT: store i64 [[TMP4]], i64* [[__VLA_EXPR1]], align 8 +// CHECK0-64-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 0 +// CHECK0-64-NEXT: [[TMP6:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK0-64-NEXT: store i32 [[TMP6]], i32* [[X]], align 4 +// CHECK0-64-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 1 +// CHECK0-64-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK0-64-NEXT: store i32 [[TMP7]], i32* [[Y]], align 4 +// CHECK0-64-NEXT: store i32* [[A]], i32** [[P]], align 64 +// CHECK0-64-NEXT: [[TMP8:%.*]] = load i32, i32* [[A]], align 4 +// CHECK0-64-NEXT: [[CONV:%.*]] = bitcast i64* [[A_CASTED]] to i32* +// CHECK0-64-NEXT: store i32 [[TMP8]], i32* [[CONV]], align 4 +// CHECK0-64-NEXT: [[TMP9:%.*]] = load i64, i64* [[A_CASTED]], align 8 +// CHECK0-64-NEXT: [[TMP10:%.*]] = load i32*, i32** [[P]], align 64 +// CHECK0-64-NEXT: [[TMP11:%.*]] = load i32, i32* @ga, align 4 +// CHECK0-64-NEXT: [[CONV2:%.*]] = bitcast i64* [[GA_CASTED]] to i32* +// CHECK0-64-NEXT: store i32 [[TMP11]], i32* [[CONV2]], align 4 +// CHECK0-64-NEXT: [[TMP12:%.*]] = load i64, i64* [[GA_CASTED]], align 8 +// CHECK0-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK0-64-NEXT: [[TMP14:%.*]] = bitcast i8** [[TMP13]] to i64* +// CHECK0-64-NEXT: store i64 [[TMP9]], i64* [[TMP14]], align 8 +// CHECK0-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 
0, i32 0 +// CHECK0-64-NEXT: [[TMP16:%.*]] = bitcast i8** [[TMP15]] to i64* +// CHECK0-64-NEXT: store i64 [[TMP9]], i64* [[TMP16]], align 8 +// CHECK0-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK0-64-NEXT: store i8* null, i8** [[TMP17]], align 8 +// CHECK0-64-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK0-64-NEXT: [[TMP19:%.*]] = bitcast i8** [[TMP18]] to i32** +// CHECK0-64-NEXT: store i32* [[TMP10]], i32** [[TMP19]], align 8 +// CHECK0-64-NEXT: [[TMP20:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK0-64-NEXT: [[TMP21:%.*]] = bitcast i8** [[TMP20]] to i32** +// CHECK0-64-NEXT: store i32* [[TMP10]], i32** [[TMP21]], align 8 +// CHECK0-64-NEXT: [[TMP22:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CHECK0-64-NEXT: store i8* null, i8** [[TMP22]], align 8 +// CHECK0-64-NEXT: [[TMP23:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK0-64-NEXT: [[TMP24:%.*]] = bitcast i8** [[TMP23]] to i64* +// CHECK0-64-NEXT: store i64 [[TMP12]], i64* [[TMP24]], align 8 +// CHECK0-64-NEXT: [[TMP25:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK0-64-NEXT: [[TMP26:%.*]] = bitcast i8** [[TMP25]] to i64* +// CHECK0-64-NEXT: store i64 [[TMP12]], i64* [[TMP26]], align 8 +// CHECK0-64-NEXT: [[TMP27:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2 +// CHECK0-64-NEXT: store i8* null, i8** [[TMP27]], align 8 +// CHECK0-64-NEXT: [[TMP28:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK0-64-NEXT: [[TMP29:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK0-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK0-64-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK0-64-NEXT: store i32 2, i32* [[TMP30]], align 4 +// CHECK0-64-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK0-64-NEXT: store i32 3, i32* [[TMP31]], align 4 +// CHECK0-64-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK0-64-NEXT: store i8** [[TMP28]], i8*** [[TMP32]], align 8 +// CHECK0-64-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK0-64-NEXT: store i8** [[TMP29]], i8*** [[TMP33]], align 8 +// CHECK0-64-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK0-64-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_sizes, i32 0, i32 0), i64** [[TMP34]], align 8 +// CHECK0-64-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK0-64-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_maptypes, i32 0, i32 0), i64** [[TMP35]], align 8 +// CHECK0-64-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], 
%struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK0-64-NEXT: store i8** null, i8*** [[TMP36]], align 8 +// CHECK0-64-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK0-64-NEXT: store i8** null, i8*** [[TMP37]], align 8 +// CHECK0-64-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK0-64-NEXT: store i64 0, i64* [[TMP38]], align 8 +// CHECK0-64-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK0-64-NEXT: store i64 0, i64* [[TMP39]], align 8 +// CHECK0-64-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK0-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP40]], align 4 +// CHECK0-64-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK0-64-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP41]], align 4 +// CHECK0-64-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK0-64-NEXT: store i32 0, i32* [[TMP42]], align 4 +// CHECK0-64-NEXT: [[TMP43:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK0-64-NEXT: [[TMP44:%.*]] = icmp ne i32 [[TMP43]], 0 +// CHECK0-64-NEXT: br i1 [[TMP44]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK0-64: omp_offload.failed: +// CHECK0-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63(i64 [[TMP9]], i32* [[TMP10]], i64 [[TMP12]]) #[[ATTR3:[0-9]+]] +// CHECK0-64-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK0-64: omp_offload.cont: +// CHECK0-64-NEXT: [[TMP45:%.*]] = load i16, i16* [[AA]], align 2 +// CHECK0-64-NEXT: [[CONV3:%.*]] = bitcast i64* [[AA_CASTED]] to i16* +// CHECK0-64-NEXT: store i16 [[TMP45]], i16* [[CONV3]], align 2 +// CHECK0-64-NEXT: [[TMP46:%.*]] = load i64, i64* [[AA_CASTED]], align 8 +// CHECK0-64-NEXT: [[TMP47:%.*]] = mul nuw i64 [[TMP1]], 4 +// CHECK0-64-NEXT: [[TMP48:%.*]] = mul nuw i64 5, [[TMP4]] +// CHECK0-64-NEXT: [[TMP49:%.*]] = mul nuw i64 [[TMP48]], 8 +// CHECK0-64-NEXT: [[TMP50:%.*]] = bitcast [9 x i64]* [[DOTOFFLOAD_SIZES]] to i8* +// CHECK0-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP50]], i8* align 8 bitcast ([9 x i64]* @.offload_sizes.1 to i8*), i64 72, i1 false) +// CHECK0-64-NEXT: [[TMP51:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 0 +// CHECK0-64-NEXT: [[TMP52:%.*]] = bitcast i8** [[TMP51]] to i64* +// CHECK0-64-NEXT: store i64 [[TMP46]], i64* [[TMP52]], align 8 +// CHECK0-64-NEXT: [[TMP53:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 0 +// CHECK0-64-NEXT: [[TMP54:%.*]] = bitcast i8** [[TMP53]] to i64* +// CHECK0-64-NEXT: store i64 [[TMP46]], i64* [[TMP54]], align 8 +// CHECK0-64-NEXT: [[TMP55:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 0 +// CHECK0-64-NEXT: store i8* null, i8** [[TMP55]], align 8 +// CHECK0-64-NEXT: 
[[TMP56:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 1 +// CHECK0-64-NEXT: [[TMP57:%.*]] = bitcast i8** [[TMP56]] to [10 x float]** +// CHECK0-64-NEXT: store [10 x float]* [[B]], [10 x float]** [[TMP57]], align 8 +// CHECK0-64-NEXT: [[TMP58:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 1 +// CHECK0-64-NEXT: [[TMP59:%.*]] = bitcast i8** [[TMP58]] to [10 x float]** +// CHECK0-64-NEXT: store [10 x float]* [[B]], [10 x float]** [[TMP59]], align 8 +// CHECK0-64-NEXT: [[TMP60:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 1 +// CHECK0-64-NEXT: store i8* null, i8** [[TMP60]], align 8 +// CHECK0-64-NEXT: [[TMP61:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 2 +// CHECK0-64-NEXT: [[TMP62:%.*]] = bitcast i8** [[TMP61]] to i64* +// CHECK0-64-NEXT: store i64 [[TMP1]], i64* [[TMP62]], align 8 +// CHECK0-64-NEXT: [[TMP63:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 2 +// CHECK0-64-NEXT: [[TMP64:%.*]] = bitcast i8** [[TMP63]] to i64* +// CHECK0-64-NEXT: store i64 [[TMP1]], i64* [[TMP64]], align 8 +// CHECK0-64-NEXT: [[TMP65:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 2 +// CHECK0-64-NEXT: store i8* null, i8** [[TMP65]], align 8 +// CHECK0-64-NEXT: [[TMP66:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 3 +// CHECK0-64-NEXT: [[TMP67:%.*]] = bitcast i8** [[TMP66]] to float** +// CHECK0-64-NEXT: store float* [[VLA]], float** [[TMP67]], align 8 +// CHECK0-64-NEXT: [[TMP68:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 3 +// CHECK0-64-NEXT: [[TMP69:%.*]] = bitcast i8** [[TMP68]] to float** +// CHECK0-64-NEXT: store float* [[VLA]], float** [[TMP69]], align 8 +// CHECK0-64-NEXT: [[TMP70:%.*]] = getelementptr inbounds [9 x i64], [9 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 3 +// CHECK0-64-NEXT: store i64 [[TMP47]], i64* [[TMP70]], align 8 +// CHECK0-64-NEXT: [[TMP71:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 3 +// CHECK0-64-NEXT: store i8* null, i8** [[TMP71]], align 8 +// CHECK0-64-NEXT: [[TMP72:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 4 +// CHECK0-64-NEXT: [[TMP73:%.*]] = bitcast i8** [[TMP72]] to [5 x [10 x double]]** +// CHECK0-64-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[TMP73]], align 8 +// CHECK0-64-NEXT: [[TMP74:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 4 +// CHECK0-64-NEXT: [[TMP75:%.*]] = bitcast i8** [[TMP74]] to [5 x [10 x double]]** +// CHECK0-64-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[TMP75]], align 8 +// CHECK0-64-NEXT: [[TMP76:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 4 +// CHECK0-64-NEXT: store i8* null, i8** [[TMP76]], align 8 +// CHECK0-64-NEXT: [[TMP77:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 5 +// CHECK0-64-NEXT: [[TMP78:%.*]] = bitcast i8** [[TMP77]] to i64* +// CHECK0-64-NEXT: store i64 5, i64* [[TMP78]], align 8 +// CHECK0-64-NEXT: [[TMP79:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 5 +// CHECK0-64-NEXT: [[TMP80:%.*]] = bitcast i8** [[TMP79]] to i64* +// CHECK0-64-NEXT: store i64 5, i64* [[TMP80]], align 8 +// CHECK0-64-NEXT: [[TMP81:%.*]] = 
getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 5 +// CHECK0-64-NEXT: store i8* null, i8** [[TMP81]], align 8 +// CHECK0-64-NEXT: [[TMP82:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 6 +// CHECK0-64-NEXT: [[TMP83:%.*]] = bitcast i8** [[TMP82]] to i64* +// CHECK0-64-NEXT: store i64 [[TMP4]], i64* [[TMP83]], align 8 +// CHECK0-64-NEXT: [[TMP84:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 6 +// CHECK0-64-NEXT: [[TMP85:%.*]] = bitcast i8** [[TMP84]] to i64* +// CHECK0-64-NEXT: store i64 [[TMP4]], i64* [[TMP85]], align 8 +// CHECK0-64-NEXT: [[TMP86:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 6 +// CHECK0-64-NEXT: store i8* null, i8** [[TMP86]], align 8 +// CHECK0-64-NEXT: [[TMP87:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 7 +// CHECK0-64-NEXT: [[TMP88:%.*]] = bitcast i8** [[TMP87]] to double** +// CHECK0-64-NEXT: store double* [[VLA1]], double** [[TMP88]], align 8 +// CHECK0-64-NEXT: [[TMP89:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 7 +// CHECK0-64-NEXT: [[TMP90:%.*]] = bitcast i8** [[TMP89]] to double** +// CHECK0-64-NEXT: store double* [[VLA1]], double** [[TMP90]], align 8 +// CHECK0-64-NEXT: [[TMP91:%.*]] = getelementptr inbounds [9 x i64], [9 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 7 +// CHECK0-64-NEXT: store i64 [[TMP49]], i64* [[TMP91]], align 8 +// CHECK0-64-NEXT: [[TMP92:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 7 +// CHECK0-64-NEXT: store i8* null, i8** [[TMP92]], align 8 +// CHECK0-64-NEXT: [[TMP93:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 8 +// CHECK0-64-NEXT: [[TMP94:%.*]] = bitcast i8** [[TMP93]] to %struct.TT** +// CHECK0-64-NEXT: store %struct.TT* [[D]], %struct.TT** [[TMP94]], align 8 +// CHECK0-64-NEXT: [[TMP95:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 8 +// CHECK0-64-NEXT: [[TMP96:%.*]] = bitcast i8** [[TMP95]] to %struct.TT** +// CHECK0-64-NEXT: store %struct.TT* [[D]], %struct.TT** [[TMP96]], align 8 +// CHECK0-64-NEXT: [[TMP97:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 8 +// CHECK0-64-NEXT: store i8* null, i8** [[TMP97]], align 8 +// CHECK0-64-NEXT: [[TMP98:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 0 +// CHECK0-64-NEXT: [[TMP99:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 0 +// CHECK0-64-NEXT: [[TMP100:%.*]] = getelementptr inbounds [9 x i64], [9 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +// CHECK0-64-NEXT: [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK0-64-NEXT: [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 0 +// CHECK0-64-NEXT: store i32 2, i32* [[TMP101]], align 4 +// CHECK0-64-NEXT: [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 1 +// CHECK0-64-NEXT: store i32 9, i32* [[TMP102]], align 4 +// CHECK0-64-NEXT: [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 2 +// CHECK0-64-NEXT: store i8** [[TMP98]], i8*** [[TMP103]], align 8 +// CHECK0-64-NEXT: 
[[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 3 +// CHECK0-64-NEXT: store i8** [[TMP99]], i8*** [[TMP104]], align 8 +// CHECK0-64-NEXT: [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 4 +// CHECK0-64-NEXT: store i64* [[TMP100]], i64** [[TMP105]], align 8 +// CHECK0-64-NEXT: [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 5 +// CHECK0-64-NEXT: store i64* getelementptr inbounds ([9 x i64], [9 x i64]* @.offload_maptypes.2, i32 0, i32 0), i64** [[TMP106]], align 8 +// CHECK0-64-NEXT: [[TMP107:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 6 +// CHECK0-64-NEXT: store i8** null, i8*** [[TMP107]], align 8 +// CHECK0-64-NEXT: [[TMP108:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 7 +// CHECK0-64-NEXT: store i8** null, i8*** [[TMP108]], align 8 +// CHECK0-64-NEXT: [[TMP109:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 8 +// CHECK0-64-NEXT: store i64 0, i64* [[TMP109]], align 8 +// CHECK0-64-NEXT: [[TMP110:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 9 +// CHECK0-64-NEXT: store i64 0, i64* [[TMP110]], align 8 +// CHECK0-64-NEXT: [[TMP111:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 10 +// CHECK0-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP111]], align 4 +// CHECK0-64-NEXT: [[TMP112:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 11 +// CHECK0-64-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP112]], align 4 +// CHECK0-64-NEXT: [[TMP113:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 12 +// CHECK0-64-NEXT: store i32 0, i32* [[TMP113]], align 4 +// CHECK0-64-NEXT: [[TMP114:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]]) +// CHECK0-64-NEXT: [[TMP115:%.*]] = icmp ne i32 [[TMP114]], 0 +// CHECK0-64-NEXT: br i1 [[TMP115]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] +// CHECK0-64: omp_offload.failed8: +// CHECK0-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70(i64 [[TMP46]], [10 x float]* [[B]], i64 [[TMP1]], float* [[VLA]], [5 x [10 x double]]* [[C]], i64 5, i64 [[TMP4]], double* [[VLA1]], %struct.TT* [[D]]) #[[ATTR3]] +// CHECK0-64-NEXT: br label [[OMP_OFFLOAD_CONT9]] +// CHECK0-64: omp_offload.cont9: +// CHECK0-64-NEXT: [[TMP116:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// CHECK0-64-NEXT: [[TMP117:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0 +// CHECK0-64-NEXT: [[TMP118:%.*]] = bitcast i8** [[TMP117]] to double** +// CHECK0-64-NEXT: store double* [[TMP116]], double** [[TMP118]], align 8 +// CHECK0-64-NEXT: [[TMP119:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS11]], i32 
0, i32 0 +// CHECK0-64-NEXT: [[TMP120:%.*]] = bitcast i8** [[TMP119]] to double** +// CHECK0-64-NEXT: store double* [[TMP116]], double** [[TMP120]], align 8 +// CHECK0-64-NEXT: [[TMP121:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS12]], i64 0, i64 0 +// CHECK0-64-NEXT: store i8* null, i8** [[TMP121]], align 8 +// CHECK0-64-NEXT: [[TMP122:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 1 +// CHECK0-64-NEXT: [[TMP123:%.*]] = bitcast i8** [[TMP122]] to %struct.TT.0** +// CHECK0-64-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[TMP123]], align 8 +// CHECK0-64-NEXT: [[TMP124:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS11]], i32 0, i32 1 +// CHECK0-64-NEXT: [[TMP125:%.*]] = bitcast i8** [[TMP124]] to %struct.TT.0** +// CHECK0-64-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[TMP125]], align 8 +// CHECK0-64-NEXT: [[TMP126:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS12]], i64 0, i64 1 +// CHECK0-64-NEXT: store i8* null, i8** [[TMP126]], align 8 +// CHECK0-64-NEXT: [[TMP127:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0 +// CHECK0-64-NEXT: [[TMP128:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS11]], i32 0, i32 0 +// CHECK0-64-NEXT: [[KERNEL_ARGS13:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK0-64-NEXT: [[TMP129:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 0 +// CHECK0-64-NEXT: store i32 2, i32* [[TMP129]], align 4 +// CHECK0-64-NEXT: [[TMP130:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 1 +// CHECK0-64-NEXT: store i32 2, i32* [[TMP130]], align 4 +// CHECK0-64-NEXT: [[TMP131:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 2 +// CHECK0-64-NEXT: store i8** [[TMP127]], i8*** [[TMP131]], align 8 +// CHECK0-64-NEXT: [[TMP132:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 3 +// CHECK0-64-NEXT: store i8** [[TMP128]], i8*** [[TMP132]], align 8 +// CHECK0-64-NEXT: [[TMP133:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 4 +// CHECK0-64-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.3, i32 0, i32 0), i64** [[TMP133]], align 8 +// CHECK0-64-NEXT: [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 5 +// CHECK0-64-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i32 0, i32 0), i64** [[TMP134]], align 8 +// CHECK0-64-NEXT: [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 6 +// CHECK0-64-NEXT: store i8** null, i8*** [[TMP135]], align 8 +// CHECK0-64-NEXT: [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 7 +// CHECK0-64-NEXT: store i8** null, i8*** [[TMP136]], align 8 +// CHECK0-64-NEXT: [[TMP137:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 8 +// CHECK0-64-NEXT: store i64 0, i64* [[TMP137]], 
align 8 +// CHECK0-64-NEXT: [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 9 +// CHECK0-64-NEXT: store i64 0, i64* [[TMP138]], align 8 +// CHECK0-64-NEXT: [[TMP139:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 10 +// CHECK0-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP139]], align 4 +// CHECK0-64-NEXT: [[TMP140:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 11 +// CHECK0-64-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP140]], align 4 +// CHECK0-64-NEXT: [[TMP141:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 12 +// CHECK0-64-NEXT: store i32 0, i32* [[TMP141]], align 4 +// CHECK0-64-NEXT: [[TMP142:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]]) +// CHECK0-64-NEXT: [[TMP143:%.*]] = icmp ne i32 [[TMP142]], 0 +// CHECK0-64-NEXT: br i1 [[TMP143]], label [[OMP_OFFLOAD_FAILED14:%.*]], label [[OMP_OFFLOAD_CONT15:%.*]] +// CHECK0-64: omp_offload.failed14: +// CHECK0-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111(double* [[TMP116]], %struct.TT.0* [[E]]) #[[ATTR3]] +// CHECK0-64-NEXT: br label [[OMP_OFFLOAD_CONT15]] +// CHECK0-64: omp_offload.cont15: +// CHECK0-64-NEXT: [[TMP144:%.*]] = load i32, i32* [[A]], align 4 +// CHECK0-64-NEXT: [[TMP145:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// CHECK0-64-NEXT: call void @llvm.stackrestore(i8* [[TMP145]]) +// CHECK0-64-NEXT: ret i32 [[TMP144]] +// CHECK0-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63 +// CHECK0-64-SAME: (i64 noundef [[A:%.*]], i32* noundef [[P:%.*]], i64 noundef [[GA:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK0-64-NEXT: entry: +// CHECK0-64-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK0-64-NEXT: [[P_ADDR:%.*]] = alloca i32*, align 8 +// CHECK0-64-NEXT: [[GA_ADDR:%.*]] = alloca i64, align 8 +// CHECK0-64-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK0-64-NEXT: store i32* [[P]], i32** [[P_ADDR]], align 8 +// CHECK0-64-NEXT: store i64 [[GA]], i64* [[GA_ADDR]], align 8 +// CHECK0-64-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// CHECK0-64-NEXT: [[CONV1:%.*]] = bitcast i64* [[GA_ADDR]] to i32* +// CHECK0-64-NEXT: ret void +// CHECK0-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70 +// CHECK0-64-SAME: (i64 noundef [[AA:%.*]], [10 x float]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i64 noundef [[VLA:%.*]], float* noundef nonnull align 4 dereferenceable(4) [[BN:%.*]], [5 x [10 x double]]* noundef nonnull align 8 dereferenceable(400) [[C:%.*]], i64 noundef [[VLA1:%.*]], i64 noundef [[VLA3:%.*]], double* noundef nonnull align 8 dereferenceable(8) [[CN:%.*]], %struct.TT* noundef nonnull align 8 dereferenceable(16) [[D:%.*]]) #[[ATTR2]] { +// CHECK0-64-NEXT: entry: +// CHECK0-64-NEXT: [[AA_ADDR:%.*]] = alloca i64, align 8 +// CHECK0-64-NEXT: [[B_ADDR:%.*]] = alloca [10 x float]*, align 8 +// CHECK0-64-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8 +// CHECK0-64-NEXT: [[BN_ADDR:%.*]] = alloca float*, align 8 +// CHECK0-64-NEXT: [[C_ADDR:%.*]] = alloca [5 x [10 x double]]*, align 8 +// CHECK0-64-NEXT: 
[[VLA_ADDR2:%.*]] = alloca i64, align 8 +// CHECK0-64-NEXT: [[VLA_ADDR4:%.*]] = alloca i64, align 8 +// CHECK0-64-NEXT: [[CN_ADDR:%.*]] = alloca double*, align 8 +// CHECK0-64-NEXT: [[D_ADDR:%.*]] = alloca %struct.TT*, align 8 +// CHECK0-64-NEXT: [[B5:%.*]] = alloca [10 x float], align 4 +// CHECK0-64-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// CHECK0-64-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// CHECK0-64-NEXT: [[C7:%.*]] = alloca [5 x [10 x double]], align 8 +// CHECK0-64-NEXT: [[__VLA_EXPR1:%.*]] = alloca i64, align 8 +// CHECK0-64-NEXT: [[__VLA_EXPR2:%.*]] = alloca i64, align 8 +// CHECK0-64-NEXT: [[D9:%.*]] = alloca [[STRUCT_TT:%.*]], align 8 +// CHECK0-64-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8 +// CHECK0-64-NEXT: store [10 x float]* [[B]], [10 x float]** [[B_ADDR]], align 8 +// CHECK0-64-NEXT: store i64 [[VLA]], i64* [[VLA_ADDR]], align 8 +// CHECK0-64-NEXT: store float* [[BN]], float** [[BN_ADDR]], align 8 +// CHECK0-64-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[C_ADDR]], align 8 +// CHECK0-64-NEXT: store i64 [[VLA1]], i64* [[VLA_ADDR2]], align 8 +// CHECK0-64-NEXT: store i64 [[VLA3]], i64* [[VLA_ADDR4]], align 8 +// CHECK0-64-NEXT: store double* [[CN]], double** [[CN_ADDR]], align 8 +// CHECK0-64-NEXT: store %struct.TT* [[D]], %struct.TT** [[D_ADDR]], align 8 +// CHECK0-64-NEXT: [[CONV:%.*]] = bitcast i64* [[AA_ADDR]] to i16* +// CHECK0-64-NEXT: [[TMP0:%.*]] = load [10 x float]*, [10 x float]** [[B_ADDR]], align 8 +// CHECK0-64-NEXT: [[TMP1:%.*]] = load i64, i64* [[VLA_ADDR]], align 8 +// CHECK0-64-NEXT: [[TMP2:%.*]] = load float*, float** [[BN_ADDR]], align 8 +// CHECK0-64-NEXT: [[TMP3:%.*]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]], align 8 +// CHECK0-64-NEXT: [[TMP4:%.*]] = load i64, i64* [[VLA_ADDR2]], align 8 +// CHECK0-64-NEXT: [[TMP5:%.*]] = load i64, i64* [[VLA_ADDR4]], align 8 +// CHECK0-64-NEXT: [[TMP6:%.*]] = load double*, double** [[CN_ADDR]], align 8 +// CHECK0-64-NEXT: [[TMP7:%.*]] = load %struct.TT*, %struct.TT** [[D_ADDR]], align 8 +// CHECK0-64-NEXT: [[TMP8:%.*]] = bitcast [10 x float]* [[B5]] to i8* +// CHECK0-64-NEXT: [[TMP9:%.*]] = bitcast [10 x float]* [[TMP0]] to i8* +// CHECK0-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i64 40, i1 false) +// CHECK0-64-NEXT: [[TMP10:%.*]] = call i8* @llvm.stacksave() +// CHECK0-64-NEXT: store i8* [[TMP10]], i8** [[SAVED_STACK]], align 8 +// CHECK0-64-NEXT: [[VLA6:%.*]] = alloca float, i64 [[TMP1]], align 4 +// CHECK0-64-NEXT: store i64 [[TMP1]], i64* [[__VLA_EXPR0]], align 8 +// CHECK0-64-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP1]], 4 +// CHECK0-64-NEXT: [[TMP12:%.*]] = bitcast float* [[VLA6]] to i8* +// CHECK0-64-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP2]] to i8* +// CHECK0-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP12]], i8* align 4 [[TMP13]], i64 [[TMP11]], i1 false) +// CHECK0-64-NEXT: [[TMP14:%.*]] = bitcast [5 x [10 x double]]* [[C7]] to i8* +// CHECK0-64-NEXT: [[TMP15:%.*]] = bitcast [5 x [10 x double]]* [[TMP3]] to i8* +// CHECK0-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 400, i1 false) +// CHECK0-64-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP4]], [[TMP5]] +// CHECK0-64-NEXT: [[VLA8:%.*]] = alloca double, i64 [[TMP16]], align 8 +// CHECK0-64-NEXT: store i64 [[TMP4]], i64* [[__VLA_EXPR1]], align 8 +// CHECK0-64-NEXT: store i64 [[TMP5]], i64* [[__VLA_EXPR2]], align 8 +// CHECK0-64-NEXT: [[TMP17:%.*]] = mul nuw i64 [[TMP4]], [[TMP5]] +// 
CHECK0-64-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 8 +// CHECK0-64-NEXT: [[TMP19:%.*]] = bitcast double* [[VLA8]] to i8* +// CHECK0-64-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP6]] to i8* +// CHECK0-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP19]], i8* align 8 [[TMP20]], i64 [[TMP18]], i1 false) +// CHECK0-64-NEXT: [[TMP21:%.*]] = bitcast %struct.TT* [[D9]] to i8* +// CHECK0-64-NEXT: [[TMP22:%.*]] = bitcast %struct.TT* [[TMP7]] to i8* +// CHECK0-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP21]], i8* align 8 [[TMP22]], i64 16, i1 false) +// CHECK0-64-NEXT: [[TMP23:%.*]] = load i16, i16* [[CONV]], align 2 +// CHECK0-64-NEXT: [[CONV10:%.*]] = sext i16 [[TMP23]] to i32 +// CHECK0-64-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV10]], 1 +// CHECK0-64-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD]] to i16 +// CHECK0-64-NEXT: store i16 [[CONV11]], i16* [[CONV]], align 2 +// CHECK0-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[B5]], i64 0, i64 2 +// CHECK0-64-NEXT: store float 1.000000e+00, float* [[ARRAYIDX]], align 4 +// CHECK0-64-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[VLA6]], i64 3 +// CHECK0-64-NEXT: store float 1.000000e+00, float* [[ARRAYIDX12]], align 4 +// CHECK0-64-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[C7]], i64 0, i64 1 +// CHECK0-64-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX13]], i64 0, i64 2 +// CHECK0-64-NEXT: store double 1.000000e+00, double* [[ARRAYIDX14]], align 8 +// CHECK0-64-NEXT: [[TMP24:%.*]] = mul nsw i64 1, [[TMP5]] +// CHECK0-64-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds double, double* [[VLA8]], i64 [[TMP24]] +// CHECK0-64-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX15]], i64 3 +// CHECK0-64-NEXT: store double 1.000000e+00, double* [[ARRAYIDX16]], align 8 +// CHECK0-64-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 0 +// CHECK0-64-NEXT: store i64 1, i64* [[X]], align 8 +// CHECK0-64-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 1 +// CHECK0-64-NEXT: store i8 1, i8* [[Y]], align 8 +// CHECK0-64-NEXT: [[TMP25:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// CHECK0-64-NEXT: call void @llvm.stackrestore(i8* [[TMP25]]) +// CHECK0-64-NEXT: ret void +// CHECK0-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111 +// CHECK0-64-SAME: (double* noundef [[PTR:%.*]], %struct.TT.0* noundef nonnull align 4 dereferenceable(8) [[E:%.*]]) #[[ATTR2]] { +// CHECK0-64-NEXT: entry: +// CHECK0-64-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 8 +// CHECK0-64-NEXT: [[E_ADDR:%.*]] = alloca %struct.TT.0*, align 8 +// CHECK0-64-NEXT: [[E1:%.*]] = alloca [[STRUCT_TT_0:%.*]], align 4 +// CHECK0-64-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 8 +// CHECK0-64-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[E_ADDR]], align 8 +// CHECK0-64-NEXT: [[TMP0:%.*]] = load %struct.TT.0*, %struct.TT.0** [[E_ADDR]], align 8 +// CHECK0-64-NEXT: [[TMP1:%.*]] = bitcast %struct.TT.0* [[E1]] to i8* +// CHECK0-64-NEXT: [[TMP2:%.*]] = bitcast %struct.TT.0* [[TMP0]] to i8* +// CHECK0-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i64 8, i1 false) +// CHECK0-64-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E1]], i32 0, i32 0 +// CHECK0-64-NEXT: [[TMP3:%.*]] = load i32, i32* [[X]], align 4 +// 
CHECK0-64-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP3]] to double +// CHECK0-64-NEXT: [[TMP4:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// CHECK0-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[TMP4]], i64 0 +// CHECK0-64-NEXT: store double [[CONV]], double* [[ARRAYIDX]], align 8 +// CHECK0-64-NEXT: [[TMP5:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// CHECK0-64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[TMP5]], i64 0 +// CHECK0-64-NEXT: [[TMP6:%.*]] = load double, double* [[ARRAYIDX2]], align 8 +// CHECK0-64-NEXT: [[INC:%.*]] = fadd double [[TMP6]], 1.000000e+00 +// CHECK0-64-NEXT: store double [[INC]], double* [[ARRAYIDX2]], align 8 +// CHECK0-64-NEXT: ret void +// CHECK0-64-LABEL: define {{[^@]+}}@_Z3bariPd +// CHECK0-64-SAME: (i32 noundef signext [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0]] { +// CHECK0-64-NEXT: entry: +// CHECK0-64-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK0-64-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 8 +// CHECK0-64-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK0-64-NEXT: [[S:%.*]] = alloca [[STRUCT_S1:%.*]], align 8 +// CHECK0-64-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK0-64-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 8 +// CHECK0-64-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK0-64-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK0-64-NEXT: [[TMP1:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// CHECK0-64-NEXT: [[CALL:%.*]] = call noundef signext i32 @_Z3fooiPd(i32 noundef signext [[TMP0]], double* noundef [[TMP1]]) +// CHECK0-64-NEXT: [[TMP2:%.*]] = load i32, i32* [[A]], align 4 +// CHECK0-64-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[CALL]] +// CHECK0-64-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// CHECK0-64-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK0-64-NEXT: [[CALL1:%.*]] = call noundef signext i32 @_ZN2S12r1Ei(%struct.S1* noundef nonnull align 8 dereferenceable(8) [[S]], i32 noundef signext [[TMP3]]) +// CHECK0-64-NEXT: [[TMP4:%.*]] = load i32, i32* [[A]], align 4 +// CHECK0-64-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], [[CALL1]] +// CHECK0-64-NEXT: store i32 [[ADD2]], i32* [[A]], align 4 +// CHECK0-64-NEXT: [[TMP5:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK0-64-NEXT: [[CALL3:%.*]] = call noundef signext i32 @_ZL7fstatici(i32 noundef signext [[TMP5]]) +// CHECK0-64-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4 +// CHECK0-64-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP6]], [[CALL3]] +// CHECK0-64-NEXT: store i32 [[ADD4]], i32* [[A]], align 4 +// CHECK0-64-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK0-64-NEXT: [[CALL5:%.*]] = call noundef signext i32 @_Z9ftemplateIiET_i(i32 noundef signext [[TMP7]]) +// CHECK0-64-NEXT: [[TMP8:%.*]] = load i32, i32* [[A]], align 4 +// CHECK0-64-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP8]], [[CALL5]] +// CHECK0-64-NEXT: store i32 [[ADD6]], i32* [[A]], align 4 +// CHECK0-64-NEXT: [[TMP9:%.*]] = load i32, i32* [[A]], align 4 +// CHECK0-64-NEXT: ret i32 [[TMP9]] +// CHECK0-64-LABEL: define {{[^@]+}}@_ZN2S12r1Ei +// CHECK0-64-SAME: (%struct.S1* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] comdat align 2 { +// CHECK0-64-NEXT: entry: +// CHECK0-64-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 8 +// CHECK0-64-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK0-64-NEXT: [[B:%.*]] = alloca i32, align 4 +// CHECK0-64-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// CHECK0-64-NEXT: 
[[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// CHECK0-64-NEXT: [[B_CASTED:%.*]] = alloca i64, align 8 +// CHECK0-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK0-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK0-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [5 x i8*], align 8 +// CHECK0-64-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [5 x i64], align 8 +// CHECK0-64-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 8 +// CHECK0-64-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK0-64-NEXT: [[THIS1:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 8 +// CHECK0-64-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK0-64-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// CHECK0-64-NEXT: store i32 [[ADD]], i32* [[B]], align 4 +// CHECK0-64-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK0-64-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK0-64-NEXT: [[TMP3:%.*]] = call i8* @llvm.stacksave() +// CHECK0-64-NEXT: store i8* [[TMP3]], i8** [[SAVED_STACK]], align 8 +// CHECK0-64-NEXT: [[TMP4:%.*]] = mul nuw i64 2, [[TMP2]] +// CHECK0-64-NEXT: [[VLA:%.*]] = alloca i16, i64 [[TMP4]], align 2 +// CHECK0-64-NEXT: store i64 [[TMP2]], i64* [[__VLA_EXPR0]], align 8 +// CHECK0-64-NEXT: [[TMP5:%.*]] = load i32, i32* [[B]], align 4 +// CHECK0-64-NEXT: [[CONV:%.*]] = bitcast i64* [[B_CASTED]] to i32* +// CHECK0-64-NEXT: store i32 [[TMP5]], i32* [[CONV]], align 4 +// CHECK0-64-NEXT: [[TMP6:%.*]] = load i64, i64* [[B_CASTED]], align 8 +// CHECK0-64-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[THIS1]], i32 0, i32 0 +// CHECK0-64-NEXT: [[TMP7:%.*]] = mul nuw i64 2, [[TMP2]] +// CHECK0-64-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2 +// CHECK0-64-NEXT: [[TMP9:%.*]] = bitcast [5 x i64]* [[DOTOFFLOAD_SIZES]] to i8* +// CHECK0-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP9]], i8* align 8 bitcast ([5 x i64]* @.offload_sizes.5 to i8*), i64 40, i1 false) +// CHECK0-64-NEXT: [[TMP10:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK0-64-NEXT: [[TMP11:%.*]] = bitcast i8** [[TMP10]] to %struct.S1** +// CHECK0-64-NEXT: store %struct.S1* [[THIS1]], %struct.S1** [[TMP11]], align 8 +// CHECK0-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK0-64-NEXT: [[TMP13:%.*]] = bitcast i8** [[TMP12]] to double** +// CHECK0-64-NEXT: store double* [[A]], double** [[TMP13]], align 8 +// CHECK0-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK0-64-NEXT: store i8* null, i8** [[TMP14]], align 8 +// CHECK0-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK0-64-NEXT: [[TMP16:%.*]] = bitcast i8** [[TMP15]] to i64* +// CHECK0-64-NEXT: store i64 [[TMP6]], i64* [[TMP16]], align 8 +// CHECK0-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK0-64-NEXT: [[TMP18:%.*]] = bitcast i8** [[TMP17]] to i64* +// CHECK0-64-NEXT: store i64 [[TMP6]], i64* [[TMP18]], align 8 +// CHECK0-64-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CHECK0-64-NEXT: store i8* null, i8** [[TMP19]], align 8 +// CHECK0-64-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK0-64-NEXT: [[TMP21:%.*]] = 
bitcast i8** [[TMP20]] to i64* +// CHECK0-64-NEXT: store i64 2, i64* [[TMP21]], align 8 +// CHECK0-64-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK0-64-NEXT: [[TMP23:%.*]] = bitcast i8** [[TMP22]] to i64* +// CHECK0-64-NEXT: store i64 2, i64* [[TMP23]], align 8 +// CHECK0-64-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2 +// CHECK0-64-NEXT: store i8* null, i8** [[TMP24]], align 8 +// CHECK0-64-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 3 +// CHECK0-64-NEXT: [[TMP26:%.*]] = bitcast i8** [[TMP25]] to i64* +// CHECK0-64-NEXT: store i64 [[TMP2]], i64* [[TMP26]], align 8 +// CHECK0-64-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 3 +// CHECK0-64-NEXT: [[TMP28:%.*]] = bitcast i8** [[TMP27]] to i64* +// CHECK0-64-NEXT: store i64 [[TMP2]], i64* [[TMP28]], align 8 +// CHECK0-64-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 3 +// CHECK0-64-NEXT: store i8* null, i8** [[TMP29]], align 8 +// CHECK0-64-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 4 +// CHECK0-64-NEXT: [[TMP31:%.*]] = bitcast i8** [[TMP30]] to i16** +// CHECK0-64-NEXT: store i16* [[VLA]], i16** [[TMP31]], align 8 +// CHECK0-64-NEXT: [[TMP32:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 4 +// CHECK0-64-NEXT: [[TMP33:%.*]] = bitcast i8** [[TMP32]] to i16** +// CHECK0-64-NEXT: store i16* [[VLA]], i16** [[TMP33]], align 8 +// CHECK0-64-NEXT: [[TMP34:%.*]] = getelementptr inbounds [5 x i64], [5 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 4 +// CHECK0-64-NEXT: store i64 [[TMP8]], i64* [[TMP34]], align 8 +// CHECK0-64-NEXT: [[TMP35:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 4 +// CHECK0-64-NEXT: store i8* null, i8** [[TMP35]], align 8 +// CHECK0-64-NEXT: [[TMP36:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK0-64-NEXT: [[TMP37:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK0-64-NEXT: [[TMP38:%.*]] = getelementptr inbounds [5 x i64], [5 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +// CHECK0-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK0-64-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK0-64-NEXT: store i32 2, i32* [[TMP39]], align 4 +// CHECK0-64-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK0-64-NEXT: store i32 5, i32* [[TMP40]], align 4 +// CHECK0-64-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK0-64-NEXT: store i8** [[TMP36]], i8*** [[TMP41]], align 8 +// CHECK0-64-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK0-64-NEXT: store i8** [[TMP37]], i8*** [[TMP42]], align 8 +// CHECK0-64-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK0-64-NEXT: store i64* [[TMP38]], 
i64** [[TMP43]], align 8 +// CHECK0-64-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK0-64-NEXT: store i64* getelementptr inbounds ([5 x i64], [5 x i64]* @.offload_maptypes.6, i32 0, i32 0), i64** [[TMP44]], align 8 +// CHECK0-64-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK0-64-NEXT: store i8** null, i8*** [[TMP45]], align 8 +// CHECK0-64-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK0-64-NEXT: store i8** null, i8*** [[TMP46]], align 8 +// CHECK0-64-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK0-64-NEXT: store i64 0, i64* [[TMP47]], align 8 +// CHECK0-64-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK0-64-NEXT: store i64 0, i64* [[TMP48]], align 8 +// CHECK0-64-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK0-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP49]], align 4 +// CHECK0-64-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK0-64-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP50]], align 4 +// CHECK0-64-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK0-64-NEXT: store i32 0, i32* [[TMP51]], align 4 +// CHECK0-64-NEXT: [[TMP52:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK0-64-NEXT: [[TMP53:%.*]] = icmp ne i32 [[TMP52]], 0 +// CHECK0-64-NEXT: br i1 [[TMP53]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK0-64: omp_offload.failed: +// CHECK0-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167(%struct.S1* [[THIS1]], i64 [[TMP6]], i64 2, i64 [[TMP2]], i16* [[VLA]]) #[[ATTR3]] +// CHECK0-64-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK0-64: omp_offload.cont: +// CHECK0-64-NEXT: [[TMP54:%.*]] = mul nsw i64 1, [[TMP2]] +// CHECK0-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA]], i64 [[TMP54]] +// CHECK0-64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i64 1 +// CHECK0-64-NEXT: [[TMP55:%.*]] = load i16, i16* [[ARRAYIDX2]], align 2 +// CHECK0-64-NEXT: [[CONV3:%.*]] = sext i16 [[TMP55]] to i32 +// CHECK0-64-NEXT: [[TMP56:%.*]] = load i32, i32* [[B]], align 4 +// CHECK0-64-NEXT: [[ADD4:%.*]] = add nsw i32 [[CONV3]], [[TMP56]] +// CHECK0-64-NEXT: [[TMP57:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// CHECK0-64-NEXT: call void @llvm.stackrestore(i8* [[TMP57]]) +// CHECK0-64-NEXT: ret i32 [[ADD4]] +// CHECK0-64-LABEL: define {{[^@]+}}@_ZL7fstatici +// CHECK0-64-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0]] { +// CHECK0-64-NEXT: entry: +// CHECK0-64-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK0-64-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK0-64-NEXT: 
[[AAA:%.*]] = alloca i8, align 1 +// CHECK0-64-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// CHECK0-64-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8 +// CHECK0-64-NEXT: [[AAA_CASTED:%.*]] = alloca i64, align 8 +// CHECK0-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK0-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK0-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x i8*], align 8 +// CHECK0-64-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK0-64-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK0-64-NEXT: store i8 0, i8* [[AAA]], align 1 +// CHECK0-64-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// CHECK0-64-NEXT: [[CONV:%.*]] = bitcast i64* [[A_CASTED]] to i32* +// CHECK0-64-NEXT: store i32 [[TMP0]], i32* [[CONV]], align 4 +// CHECK0-64-NEXT: [[TMP1:%.*]] = load i64, i64* [[A_CASTED]], align 8 +// CHECK0-64-NEXT: [[TMP2:%.*]] = load i8, i8* [[AAA]], align 1 +// CHECK0-64-NEXT: [[CONV1:%.*]] = bitcast i64* [[AAA_CASTED]] to i8* +// CHECK0-64-NEXT: store i8 [[TMP2]], i8* [[CONV1]], align 1 +// CHECK0-64-NEXT: [[TMP3:%.*]] = load i64, i64* [[AAA_CASTED]], align 8 +// CHECK0-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK0-64-NEXT: [[TMP5:%.*]] = bitcast i8** [[TMP4]] to i64* +// CHECK0-64-NEXT: store i64 [[TMP1]], i64* [[TMP5]], align 8 +// CHECK0-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK0-64-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i64* +// CHECK0-64-NEXT: store i64 [[TMP1]], i64* [[TMP7]], align 8 +// CHECK0-64-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK0-64-NEXT: store i8* null, i8** [[TMP8]], align 8 +// CHECK0-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK0-64-NEXT: [[TMP10:%.*]] = bitcast i8** [[TMP9]] to i64* +// CHECK0-64-NEXT: store i64 [[TMP3]], i64* [[TMP10]], align 8 +// CHECK0-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK0-64-NEXT: [[TMP12:%.*]] = bitcast i8** [[TMP11]] to i64* +// CHECK0-64-NEXT: store i64 [[TMP3]], i64* [[TMP12]], align 8 +// CHECK0-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CHECK0-64-NEXT: store i8* null, i8** [[TMP13]], align 8 +// CHECK0-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK0-64-NEXT: [[TMP15:%.*]] = bitcast i8** [[TMP14]] to [10 x i32]** +// CHECK0-64-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP15]], align 8 +// CHECK0-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK0-64-NEXT: [[TMP17:%.*]] = bitcast i8** [[TMP16]] to [10 x i32]** +// CHECK0-64-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP17]], align 8 +// CHECK0-64-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2 +// CHECK0-64-NEXT: store i8* null, i8** [[TMP18]], align 8 +// CHECK0-64-NEXT: [[TMP19:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK0-64-NEXT: [[TMP20:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK0-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK0-64-NEXT: 
[[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK0-64-NEXT: store i32 2, i32* [[TMP21]], align 4 +// CHECK0-64-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK0-64-NEXT: store i32 3, i32* [[TMP22]], align 4 +// CHECK0-64-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK0-64-NEXT: store i8** [[TMP19]], i8*** [[TMP23]], align 8 +// CHECK0-64-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK0-64-NEXT: store i8** [[TMP20]], i8*** [[TMP24]], align 8 +// CHECK0-64-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK0-64-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_sizes.7, i32 0, i32 0), i64** [[TMP25]], align 8 +// CHECK0-64-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK0-64-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_maptypes.8, i32 0, i32 0), i64** [[TMP26]], align 8 +// CHECK0-64-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK0-64-NEXT: store i8** null, i8*** [[TMP27]], align 8 +// CHECK0-64-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK0-64-NEXT: store i8** null, i8*** [[TMP28]], align 8 +// CHECK0-64-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK0-64-NEXT: store i64 0, i64* [[TMP29]], align 8 +// CHECK0-64-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK0-64-NEXT: store i64 0, i64* [[TMP30]], align 8 +// CHECK0-64-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK0-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP31]], align 4 +// CHECK0-64-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK0-64-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP32]], align 4 +// CHECK0-64-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK0-64-NEXT: store i32 0, i32* [[TMP33]], align 4 +// CHECK0-64-NEXT: [[TMP34:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK0-64-NEXT: [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0 +// CHECK0-64-NEXT: br i1 [[TMP35]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK0-64: omp_offload.failed: +// CHECK0-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142(i64 [[TMP1]], i64 
[[TMP3]], [10 x i32]* [[B]]) #[[ATTR3]] +// CHECK0-64-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK0-64: omp_offload.cont: +// CHECK0-64-NEXT: [[TMP36:%.*]] = load i32, i32* [[A]], align 4 +// CHECK0-64-NEXT: ret i32 [[TMP36]] +// CHECK0-64-LABEL: define {{[^@]+}}@_Z9ftemplateIiET_i +// CHECK0-64-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0]] comdat { +// CHECK0-64-NEXT: entry: +// CHECK0-64-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK0-64-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK0-64-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// CHECK0-64-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8 +// CHECK0-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [2 x i8*], align 8 +// CHECK0-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [2 x i8*], align 8 +// CHECK0-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [2 x i8*], align 8 +// CHECK0-64-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK0-64-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK0-64-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// CHECK0-64-NEXT: [[CONV:%.*]] = bitcast i64* [[A_CASTED]] to i32* +// CHECK0-64-NEXT: store i32 [[TMP0]], i32* [[CONV]], align 4 +// CHECK0-64-NEXT: [[TMP1:%.*]] = load i64, i64* [[A_CASTED]], align 8 +// CHECK0-64-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK0-64-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i64* +// CHECK0-64-NEXT: store i64 [[TMP1]], i64* [[TMP3]], align 8 +// CHECK0-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK0-64-NEXT: [[TMP5:%.*]] = bitcast i8** [[TMP4]] to i64* +// CHECK0-64-NEXT: store i64 [[TMP1]], i64* [[TMP5]], align 8 +// CHECK0-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK0-64-NEXT: store i8* null, i8** [[TMP6]], align 8 +// CHECK0-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK0-64-NEXT: [[TMP8:%.*]] = bitcast i8** [[TMP7]] to [10 x i32]** +// CHECK0-64-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP8]], align 8 +// CHECK0-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK0-64-NEXT: [[TMP10:%.*]] = bitcast i8** [[TMP9]] to [10 x i32]** +// CHECK0-64-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP10]], align 8 +// CHECK0-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CHECK0-64-NEXT: store i8* null, i8** [[TMP11]], align 8 +// CHECK0-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK0-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK0-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK0-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK0-64-NEXT: store i32 2, i32* [[TMP14]], align 4 +// CHECK0-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK0-64-NEXT: store i32 2, i32* [[TMP15]], align 4 +// CHECK0-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK0-64-NEXT: store i8** [[TMP12]], i8*** 
[[TMP16]], align 8 +// CHECK0-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK0-64-NEXT: store i8** [[TMP13]], i8*** [[TMP17]], align 8 +// CHECK0-64-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK0-64-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.9, i32 0, i32 0), i64** [[TMP18]], align 8 +// CHECK0-64-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK0-64-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.10, i32 0, i32 0), i64** [[TMP19]], align 8 +// CHECK0-64-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK0-64-NEXT: store i8** null, i8*** [[TMP20]], align 8 +// CHECK0-64-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK0-64-NEXT: store i8** null, i8*** [[TMP21]], align 8 +// CHECK0-64-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK0-64-NEXT: store i64 0, i64* [[TMP22]], align 8 +// CHECK0-64-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK0-64-NEXT: store i64 0, i64* [[TMP23]], align 8 +// CHECK0-64-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK0-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP24]], align 4 +// CHECK0-64-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK0-64-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP25]], align 4 +// CHECK0-64-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK0-64-NEXT: store i32 0, i32* [[TMP26]], align 4 +// CHECK0-64-NEXT: [[TMP27:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK0-64-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0 +// CHECK0-64-NEXT: br i1 [[TMP28]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK0-64: omp_offload.failed: +// CHECK0-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128(i64 [[TMP1]], [10 x i32]* [[B]]) #[[ATTR3]] +// CHECK0-64-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK0-64: omp_offload.cont: +// CHECK0-64-NEXT: [[TMP29:%.*]] = load i32, i32* [[A]], align 4 +// CHECK0-64-NEXT: ret i32 [[TMP29]] +// CHECK0-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167 +// CHECK0-64-SAME: (%struct.S1* noundef [[THIS:%.*]], i64 noundef [[B:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR2]] { +// CHECK0-64-NEXT: entry: +// CHECK0-64-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, 
align 8 +// CHECK0-64-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 +// CHECK0-64-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8 +// CHECK0-64-NEXT: [[VLA_ADDR2:%.*]] = alloca i64, align 8 +// CHECK0-64-NEXT: [[C_ADDR:%.*]] = alloca i16*, align 8 +// CHECK0-64-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// CHECK0-64-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// CHECK0-64-NEXT: [[__VLA_EXPR1:%.*]] = alloca i64, align 8 +// CHECK0-64-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 8 +// CHECK0-64-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8 +// CHECK0-64-NEXT: store i64 [[VLA]], i64* [[VLA_ADDR]], align 8 +// CHECK0-64-NEXT: store i64 [[VLA1]], i64* [[VLA_ADDR2]], align 8 +// CHECK0-64-NEXT: store i16* [[C]], i16** [[C_ADDR]], align 8 +// CHECK0-64-NEXT: [[TMP0:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 8 +// CHECK0-64-NEXT: [[CONV:%.*]] = bitcast i64* [[B_ADDR]] to i32* +// CHECK0-64-NEXT: [[TMP1:%.*]] = load i64, i64* [[VLA_ADDR]], align 8 +// CHECK0-64-NEXT: [[TMP2:%.*]] = load i64, i64* [[VLA_ADDR2]], align 8 +// CHECK0-64-NEXT: [[TMP3:%.*]] = load i16*, i16** [[C_ADDR]], align 8 +// CHECK0-64-NEXT: [[TMP4:%.*]] = call i8* @llvm.stacksave() +// CHECK0-64-NEXT: store i8* [[TMP4]], i8** [[SAVED_STACK]], align 8 +// CHECK0-64-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP1]], [[TMP2]] +// CHECK0-64-NEXT: [[VLA3:%.*]] = alloca i16, i64 [[TMP5]], align 2 +// CHECK0-64-NEXT: store i64 [[TMP1]], i64* [[__VLA_EXPR0]], align 8 +// CHECK0-64-NEXT: store i64 [[TMP2]], i64* [[__VLA_EXPR1]], align 8 +// CHECK0-64-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP1]], [[TMP2]] +// CHECK0-64-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 2 +// CHECK0-64-NEXT: [[TMP8:%.*]] = bitcast i16* [[VLA3]] to i8* +// CHECK0-64-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP3]] to i8* +// CHECK0-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 [[TMP8]], i8* align 2 [[TMP9]], i64 [[TMP7]], i1 false) +// CHECK0-64-NEXT: [[TMP10:%.*]] = load i32, i32* [[CONV]], align 4 +// CHECK0-64-NEXT: [[CONV4:%.*]] = sitofp i32 [[TMP10]] to double +// CHECK0-64-NEXT: [[ADD:%.*]] = fadd double [[CONV4]], 1.500000e+00 +// CHECK0-64-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[TMP0]], i32 0, i32 0 +// CHECK0-64-NEXT: store double [[ADD]], double* [[A]], align 8 +// CHECK0-64-NEXT: [[A5:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0 +// CHECK0-64-NEXT: [[TMP11:%.*]] = load double, double* [[A5]], align 8 +// CHECK0-64-NEXT: [[INC:%.*]] = fadd double [[TMP11]], 1.000000e+00 +// CHECK0-64-NEXT: store double [[INC]], double* [[A5]], align 8 +// CHECK0-64-NEXT: [[CONV6:%.*]] = fptosi double [[INC]] to i16 +// CHECK0-64-NEXT: [[TMP12:%.*]] = mul nsw i64 1, [[TMP2]] +// CHECK0-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA3]], i64 [[TMP12]] +// CHECK0-64-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i64 1 +// CHECK0-64-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX7]], align 2 +// CHECK0-64-NEXT: [[TMP13:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// CHECK0-64-NEXT: call void @llvm.stackrestore(i8* [[TMP13]]) +// CHECK0-64-NEXT: ret void +// CHECK0-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142 +// CHECK0-64-SAME: (i64 noundef [[A:%.*]], i64 noundef [[AAA:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR2]] { +// CHECK0-64-NEXT: entry: +// CHECK0-64-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK0-64-NEXT: [[AAA_ADDR:%.*]] = alloca 
i64, align 8 +// CHECK0-64-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK0-64-NEXT: [[B2:%.*]] = alloca [10 x i32], align 4 +// CHECK0-64-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK0-64-NEXT: store i64 [[AAA]], i64* [[AAA_ADDR]], align 8 +// CHECK0-64-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK0-64-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// CHECK0-64-NEXT: [[CONV1:%.*]] = bitcast i64* [[AAA_ADDR]] to i8* +// CHECK0-64-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK0-64-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B2]] to i8* +// CHECK0-64-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK0-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i64 40, i1 false) +// CHECK0-64-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 4 +// CHECK0-64-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK0-64-NEXT: store i32 [[ADD]], i32* [[CONV]], align 4 +// CHECK0-64-NEXT: [[TMP4:%.*]] = load i8, i8* [[CONV1]], align 1 +// CHECK0-64-NEXT: [[CONV3:%.*]] = sext i8 [[TMP4]] to i32 +// CHECK0-64-NEXT: [[ADD4:%.*]] = add nsw i32 [[CONV3]], 1 +// CHECK0-64-NEXT: [[CONV5:%.*]] = trunc i32 [[ADD4]] to i8 +// CHECK0-64-NEXT: store i8 [[CONV5]], i8* [[CONV1]], align 1 +// CHECK0-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B2]], i64 0, i64 2 +// CHECK0-64-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK0-64-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK0-64-NEXT: store i32 [[ADD6]], i32* [[ARRAYIDX]], align 4 +// CHECK0-64-NEXT: ret void +// CHECK0-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128 +// CHECK0-64-SAME: (i64 noundef [[A:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR2]] { +// CHECK0-64-NEXT: entry: +// CHECK0-64-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK0-64-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK0-64-NEXT: [[B1:%.*]] = alloca [10 x i32], align 4 +// CHECK0-64-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK0-64-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK0-64-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// CHECK0-64-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK0-64-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B1]] to i8* +// CHECK0-64-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK0-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i64 40, i1 false) +// CHECK0-64-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 4 +// CHECK0-64-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK0-64-NEXT: store i32 [[ADD]], i32* [[CONV]], align 4 +// CHECK0-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B1]], i64 0, i64 2 +// CHECK0-64-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK0-64-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK0-64-NEXT: store i32 [[ADD2]], i32* [[ARRAYIDX]], align 4 +// CHECK0-64-NEXT: ret void +// CHECK0-64-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// CHECK0-64-SAME: () #[[ATTR5:[0-9]+]] { +// CHECK0-64-NEXT: entry: +// CHECK0-64-NEXT: call void @__tgt_register_requires(i64 1) +// CHECK0-64-NEXT: ret void +// CHECK1-64-LABEL: define {{[^@]+}}@_Z3fooiPd +// CHECK1-64-SAME: (i32 noundef signext [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +// 
CHECK1-64-NEXT: entry: +// CHECK1-64-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-64-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 8 +// CHECK1-64-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK1-64-NEXT: [[AA:%.*]] = alloca i16, align 2 +// CHECK1-64-NEXT: [[B:%.*]] = alloca [10 x float], align 4 +// CHECK1-64-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// CHECK1-64-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// CHECK1-64-NEXT: [[C:%.*]] = alloca [5 x [10 x double]], align 8 +// CHECK1-64-NEXT: [[__VLA_EXPR1:%.*]] = alloca i64, align 8 +// CHECK1-64-NEXT: [[D:%.*]] = alloca [[STRUCT_TT:%.*]], align 8 +// CHECK1-64-NEXT: [[E:%.*]] = alloca [[STRUCT_TT_0:%.*]], align 4 +// CHECK1-64-NEXT: [[P:%.*]] = alloca i32*, align 64 +// CHECK1-64-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-64-NEXT: [[GA_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK1-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK1-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x i8*], align 8 +// CHECK1-64-NEXT: [[AA_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-64-NEXT: [[DOTOFFLOAD_BASEPTRS4:%.*]] = alloca [9 x i8*], align 8 +// CHECK1-64-NEXT: [[DOTOFFLOAD_PTRS5:%.*]] = alloca [9 x i8*], align 8 +// CHECK1-64-NEXT: [[DOTOFFLOAD_MAPPERS6:%.*]] = alloca [9 x i8*], align 8 +// CHECK1-64-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [9 x i64], align 8 +// CHECK1-64-NEXT: [[DOTOFFLOAD_BASEPTRS10:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-64-NEXT: [[DOTOFFLOAD_PTRS11:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-64-NEXT: [[DOTOFFLOAD_MAPPERS12:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-64-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK1-64-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 8 +// CHECK1-64-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK1-64-NEXT: store i16 0, i16* [[AA]], align 2 +// CHECK1-64-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK1-64-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +// CHECK1-64-NEXT: [[TMP2:%.*]] = call i8* @llvm.stacksave() +// CHECK1-64-NEXT: store i8* [[TMP2]], i8** [[SAVED_STACK]], align 8 +// CHECK1-64-NEXT: [[VLA:%.*]] = alloca float, i64 [[TMP1]], align 4 +// CHECK1-64-NEXT: store i64 [[TMP1]], i64* [[__VLA_EXPR0]], align 8 +// CHECK1-64-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK1-64-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +// CHECK1-64-NEXT: [[TMP5:%.*]] = mul nuw i64 5, [[TMP4]] +// CHECK1-64-NEXT: [[VLA1:%.*]] = alloca double, i64 [[TMP5]], align 8 +// CHECK1-64-NEXT: store i64 [[TMP4]], i64* [[__VLA_EXPR1]], align 8 +// CHECK1-64-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 0 +// CHECK1-64-NEXT: [[TMP6:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK1-64-NEXT: store i32 [[TMP6]], i32* [[X]], align 4 +// CHECK1-64-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 1 +// CHECK1-64-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK1-64-NEXT: store i32 [[TMP7]], i32* [[Y]], align 4 +// CHECK1-64-NEXT: store i32* [[A]], i32** [[P]], align 64 +// CHECK1-64-NEXT: [[TMP8:%.*]] = load i32, i32* [[A]], align 4 +// CHECK1-64-NEXT: [[CONV:%.*]] = bitcast i64* [[A_CASTED]] to i32* +// CHECK1-64-NEXT: store i32 [[TMP8]], i32* [[CONV]], align 4 +// CHECK1-64-NEXT: [[TMP9:%.*]] = load i64, i64* [[A_CASTED]], align 8 +// CHECK1-64-NEXT: [[TMP10:%.*]] = load i32*, i32** [[P]], align 64 +// CHECK1-64-NEXT: 
[[TMP11:%.*]] = load i32, i32* @ga, align 4 +// CHECK1-64-NEXT: [[CONV2:%.*]] = bitcast i64* [[GA_CASTED]] to i32* +// CHECK1-64-NEXT: store i32 [[TMP11]], i32* [[CONV2]], align 4 +// CHECK1-64-NEXT: [[TMP12:%.*]] = load i64, i64* [[GA_CASTED]], align 8 +// CHECK1-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-64-NEXT: [[TMP14:%.*]] = bitcast i8** [[TMP13]] to i64* +// CHECK1-64-NEXT: store i64 [[TMP9]], i64* [[TMP14]], align 8 +// CHECK1-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-64-NEXT: [[TMP16:%.*]] = bitcast i8** [[TMP15]] to i64* +// CHECK1-64-NEXT: store i64 [[TMP9]], i64* [[TMP16]], align 8 +// CHECK1-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK1-64-NEXT: store i8* null, i8** [[TMP17]], align 8 +// CHECK1-64-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK1-64-NEXT: [[TMP19:%.*]] = bitcast i8** [[TMP18]] to i32** +// CHECK1-64-NEXT: store i32* [[TMP10]], i32** [[TMP19]], align 8 +// CHECK1-64-NEXT: [[TMP20:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK1-64-NEXT: [[TMP21:%.*]] = bitcast i8** [[TMP20]] to i32** +// CHECK1-64-NEXT: store i32* [[TMP10]], i32** [[TMP21]], align 8 +// CHECK1-64-NEXT: [[TMP22:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CHECK1-64-NEXT: store i8* null, i8** [[TMP22]], align 8 +// CHECK1-64-NEXT: [[TMP23:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK1-64-NEXT: [[TMP24:%.*]] = bitcast i8** [[TMP23]] to i64* +// CHECK1-64-NEXT: store i64 [[TMP12]], i64* [[TMP24]], align 8 +// CHECK1-64-NEXT: [[TMP25:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK1-64-NEXT: [[TMP26:%.*]] = bitcast i8** [[TMP25]] to i64* +// CHECK1-64-NEXT: store i64 [[TMP12]], i64* [[TMP26]], align 8 +// CHECK1-64-NEXT: [[TMP27:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2 +// CHECK1-64-NEXT: store i8* null, i8** [[TMP27]], align 8 +// CHECK1-64-NEXT: [[TMP28:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-64-NEXT: [[TMP29:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK1-64-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK1-64-NEXT: store i32 2, i32* [[TMP30]], align 4 +// CHECK1-64-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK1-64-NEXT: store i32 3, i32* [[TMP31]], align 4 +// CHECK1-64-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK1-64-NEXT: store i8** [[TMP28]], i8*** [[TMP32]], align 8 +// CHECK1-64-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK1-64-NEXT: store i8** [[TMP29]], i8*** [[TMP33]], align 8 +// CHECK1-64-NEXT: [[TMP34:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK1-64-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_sizes, i32 0, i32 0), i64** [[TMP34]], align 8 +// CHECK1-64-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK1-64-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_maptypes, i32 0, i32 0), i64** [[TMP35]], align 8 +// CHECK1-64-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK1-64-NEXT: store i8** null, i8*** [[TMP36]], align 8 +// CHECK1-64-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK1-64-NEXT: store i8** null, i8*** [[TMP37]], align 8 +// CHECK1-64-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK1-64-NEXT: store i64 0, i64* [[TMP38]], align 8 +// CHECK1-64-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK1-64-NEXT: store i64 0, i64* [[TMP39]], align 8 +// CHECK1-64-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK1-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP40]], align 4 +// CHECK1-64-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK1-64-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP41]], align 4 +// CHECK1-64-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK1-64-NEXT: store i32 0, i32* [[TMP42]], align 4 +// CHECK1-64-NEXT: [[TMP43:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK1-64-NEXT: [[TMP44:%.*]] = icmp ne i32 [[TMP43]], 0 +// CHECK1-64-NEXT: br i1 [[TMP44]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK1-64: omp_offload.failed: +// CHECK1-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63(i64 [[TMP9]], i32* [[TMP10]], i64 [[TMP12]]) #[[ATTR3:[0-9]+]] +// CHECK1-64-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK1-64: omp_offload.cont: +// CHECK1-64-NEXT: [[TMP45:%.*]] = load i16, i16* [[AA]], align 2 +// CHECK1-64-NEXT: [[CONV3:%.*]] = bitcast i64* [[AA_CASTED]] to i16* +// CHECK1-64-NEXT: store i16 [[TMP45]], i16* [[CONV3]], align 2 +// CHECK1-64-NEXT: [[TMP46:%.*]] = load i64, i64* [[AA_CASTED]], align 8 +// CHECK1-64-NEXT: [[TMP47:%.*]] = mul nuw i64 [[TMP1]], 4 +// CHECK1-64-NEXT: [[TMP48:%.*]] = mul nuw i64 5, [[TMP4]] +// CHECK1-64-NEXT: [[TMP49:%.*]] = mul nuw i64 [[TMP48]], 8 +// CHECK1-64-NEXT: [[TMP50:%.*]] = bitcast [9 x i64]* [[DOTOFFLOAD_SIZES]] to i8* +// CHECK1-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP50]], i8* align 8 bitcast ([9 x i64]* @.offload_sizes.1 to i8*), i64 72, i1 false) +// CHECK1-64-NEXT: [[TMP51:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* 
[[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 0 +// CHECK1-64-NEXT: [[TMP52:%.*]] = bitcast i8** [[TMP51]] to i64* +// CHECK1-64-NEXT: store i64 [[TMP46]], i64* [[TMP52]], align 8 +// CHECK1-64-NEXT: [[TMP53:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 0 +// CHECK1-64-NEXT: [[TMP54:%.*]] = bitcast i8** [[TMP53]] to i64* +// CHECK1-64-NEXT: store i64 [[TMP46]], i64* [[TMP54]], align 8 +// CHECK1-64-NEXT: [[TMP55:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 0 +// CHECK1-64-NEXT: store i8* null, i8** [[TMP55]], align 8 +// CHECK1-64-NEXT: [[TMP56:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 1 +// CHECK1-64-NEXT: [[TMP57:%.*]] = bitcast i8** [[TMP56]] to [10 x float]** +// CHECK1-64-NEXT: store [10 x float]* [[B]], [10 x float]** [[TMP57]], align 8 +// CHECK1-64-NEXT: [[TMP58:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 1 +// CHECK1-64-NEXT: [[TMP59:%.*]] = bitcast i8** [[TMP58]] to [10 x float]** +// CHECK1-64-NEXT: store [10 x float]* [[B]], [10 x float]** [[TMP59]], align 8 +// CHECK1-64-NEXT: [[TMP60:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 1 +// CHECK1-64-NEXT: store i8* null, i8** [[TMP60]], align 8 +// CHECK1-64-NEXT: [[TMP61:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 2 +// CHECK1-64-NEXT: [[TMP62:%.*]] = bitcast i8** [[TMP61]] to i64* +// CHECK1-64-NEXT: store i64 [[TMP1]], i64* [[TMP62]], align 8 +// CHECK1-64-NEXT: [[TMP63:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 2 +// CHECK1-64-NEXT: [[TMP64:%.*]] = bitcast i8** [[TMP63]] to i64* +// CHECK1-64-NEXT: store i64 [[TMP1]], i64* [[TMP64]], align 8 +// CHECK1-64-NEXT: [[TMP65:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 2 +// CHECK1-64-NEXT: store i8* null, i8** [[TMP65]], align 8 +// CHECK1-64-NEXT: [[TMP66:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 3 +// CHECK1-64-NEXT: [[TMP67:%.*]] = bitcast i8** [[TMP66]] to float** +// CHECK1-64-NEXT: store float* [[VLA]], float** [[TMP67]], align 8 +// CHECK1-64-NEXT: [[TMP68:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 3 +// CHECK1-64-NEXT: [[TMP69:%.*]] = bitcast i8** [[TMP68]] to float** +// CHECK1-64-NEXT: store float* [[VLA]], float** [[TMP69]], align 8 +// CHECK1-64-NEXT: [[TMP70:%.*]] = getelementptr inbounds [9 x i64], [9 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 3 +// CHECK1-64-NEXT: store i64 [[TMP47]], i64* [[TMP70]], align 8 +// CHECK1-64-NEXT: [[TMP71:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 3 +// CHECK1-64-NEXT: store i8* null, i8** [[TMP71]], align 8 +// CHECK1-64-NEXT: [[TMP72:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 4 +// CHECK1-64-NEXT: [[TMP73:%.*]] = bitcast i8** [[TMP72]] to [5 x [10 x double]]** +// CHECK1-64-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[TMP73]], align 8 +// CHECK1-64-NEXT: [[TMP74:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 4 +// CHECK1-64-NEXT: [[TMP75:%.*]] = bitcast i8** [[TMP74]] to [5 x [10 x double]]** +// CHECK1-64-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[TMP75]], align 8 +// CHECK1-64-NEXT: [[TMP76:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* 
[[DOTOFFLOAD_MAPPERS6]], i64 0, i64 4 +// CHECK1-64-NEXT: store i8* null, i8** [[TMP76]], align 8 +// CHECK1-64-NEXT: [[TMP77:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 5 +// CHECK1-64-NEXT: [[TMP78:%.*]] = bitcast i8** [[TMP77]] to i64* +// CHECK1-64-NEXT: store i64 5, i64* [[TMP78]], align 8 +// CHECK1-64-NEXT: [[TMP79:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 5 +// CHECK1-64-NEXT: [[TMP80:%.*]] = bitcast i8** [[TMP79]] to i64* +// CHECK1-64-NEXT: store i64 5, i64* [[TMP80]], align 8 +// CHECK1-64-NEXT: [[TMP81:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 5 +// CHECK1-64-NEXT: store i8* null, i8** [[TMP81]], align 8 +// CHECK1-64-NEXT: [[TMP82:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 6 +// CHECK1-64-NEXT: [[TMP83:%.*]] = bitcast i8** [[TMP82]] to i64* +// CHECK1-64-NEXT: store i64 [[TMP4]], i64* [[TMP83]], align 8 +// CHECK1-64-NEXT: [[TMP84:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 6 +// CHECK1-64-NEXT: [[TMP85:%.*]] = bitcast i8** [[TMP84]] to i64* +// CHECK1-64-NEXT: store i64 [[TMP4]], i64* [[TMP85]], align 8 +// CHECK1-64-NEXT: [[TMP86:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 6 +// CHECK1-64-NEXT: store i8* null, i8** [[TMP86]], align 8 +// CHECK1-64-NEXT: [[TMP87:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 7 +// CHECK1-64-NEXT: [[TMP88:%.*]] = bitcast i8** [[TMP87]] to double** +// CHECK1-64-NEXT: store double* [[VLA1]], double** [[TMP88]], align 8 +// CHECK1-64-NEXT: [[TMP89:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 7 +// CHECK1-64-NEXT: [[TMP90:%.*]] = bitcast i8** [[TMP89]] to double** +// CHECK1-64-NEXT: store double* [[VLA1]], double** [[TMP90]], align 8 +// CHECK1-64-NEXT: [[TMP91:%.*]] = getelementptr inbounds [9 x i64], [9 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 7 +// CHECK1-64-NEXT: store i64 [[TMP49]], i64* [[TMP91]], align 8 +// CHECK1-64-NEXT: [[TMP92:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 7 +// CHECK1-64-NEXT: store i8* null, i8** [[TMP92]], align 8 +// CHECK1-64-NEXT: [[TMP93:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 8 +// CHECK1-64-NEXT: [[TMP94:%.*]] = bitcast i8** [[TMP93]] to %struct.TT** +// CHECK1-64-NEXT: store %struct.TT* [[D]], %struct.TT** [[TMP94]], align 8 +// CHECK1-64-NEXT: [[TMP95:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 8 +// CHECK1-64-NEXT: [[TMP96:%.*]] = bitcast i8** [[TMP95]] to %struct.TT** +// CHECK1-64-NEXT: store %struct.TT* [[D]], %struct.TT** [[TMP96]], align 8 +// CHECK1-64-NEXT: [[TMP97:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 8 +// CHECK1-64-NEXT: store i8* null, i8** [[TMP97]], align 8 +// CHECK1-64-NEXT: [[TMP98:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 0 +// CHECK1-64-NEXT: [[TMP99:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 0 +// CHECK1-64-NEXT: [[TMP100:%.*]] = getelementptr inbounds [9 x i64], [9 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +// CHECK1-64-NEXT: [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK1-64-NEXT: [[TMP101:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 0 +// CHECK1-64-NEXT: store i32 2, i32* [[TMP101]], align 4 +// CHECK1-64-NEXT: [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 1 +// CHECK1-64-NEXT: store i32 9, i32* [[TMP102]], align 4 +// CHECK1-64-NEXT: [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 2 +// CHECK1-64-NEXT: store i8** [[TMP98]], i8*** [[TMP103]], align 8 +// CHECK1-64-NEXT: [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 3 +// CHECK1-64-NEXT: store i8** [[TMP99]], i8*** [[TMP104]], align 8 +// CHECK1-64-NEXT: [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 4 +// CHECK1-64-NEXT: store i64* [[TMP100]], i64** [[TMP105]], align 8 +// CHECK1-64-NEXT: [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 5 +// CHECK1-64-NEXT: store i64* getelementptr inbounds ([9 x i64], [9 x i64]* @.offload_maptypes.2, i32 0, i32 0), i64** [[TMP106]], align 8 +// CHECK1-64-NEXT: [[TMP107:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 6 +// CHECK1-64-NEXT: store i8** null, i8*** [[TMP107]], align 8 +// CHECK1-64-NEXT: [[TMP108:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 7 +// CHECK1-64-NEXT: store i8** null, i8*** [[TMP108]], align 8 +// CHECK1-64-NEXT: [[TMP109:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 8 +// CHECK1-64-NEXT: store i64 0, i64* [[TMP109]], align 8 +// CHECK1-64-NEXT: [[TMP110:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 9 +// CHECK1-64-NEXT: store i64 0, i64* [[TMP110]], align 8 +// CHECK1-64-NEXT: [[TMP111:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 10 +// CHECK1-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP111]], align 4 +// CHECK1-64-NEXT: [[TMP112:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 11 +// CHECK1-64-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP112]], align 4 +// CHECK1-64-NEXT: [[TMP113:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 12 +// CHECK1-64-NEXT: store i32 0, i32* [[TMP113]], align 4 +// CHECK1-64-NEXT: [[TMP114:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]]) +// CHECK1-64-NEXT: [[TMP115:%.*]] = icmp ne i32 [[TMP114]], 0 +// CHECK1-64-NEXT: br i1 [[TMP115]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] +// CHECK1-64: omp_offload.failed8: +// CHECK1-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70(i64 [[TMP46]], [10 x float]* [[B]], i64 [[TMP1]], float* [[VLA]], [5 x [10 x double]]* 
[[C]], i64 5, i64 [[TMP4]], double* [[VLA1]], %struct.TT* [[D]]) #[[ATTR3]] +// CHECK1-64-NEXT: br label [[OMP_OFFLOAD_CONT9]] +// CHECK1-64: omp_offload.cont9: +// CHECK1-64-NEXT: [[TMP116:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// CHECK1-64-NEXT: [[TMP117:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0 +// CHECK1-64-NEXT: [[TMP118:%.*]] = bitcast i8** [[TMP117]] to double** +// CHECK1-64-NEXT: store double* [[TMP116]], double** [[TMP118]], align 8 +// CHECK1-64-NEXT: [[TMP119:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS11]], i32 0, i32 0 +// CHECK1-64-NEXT: [[TMP120:%.*]] = bitcast i8** [[TMP119]] to double** +// CHECK1-64-NEXT: store double* [[TMP116]], double** [[TMP120]], align 8 +// CHECK1-64-NEXT: [[TMP121:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS12]], i64 0, i64 0 +// CHECK1-64-NEXT: store i8* null, i8** [[TMP121]], align 8 +// CHECK1-64-NEXT: [[TMP122:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 1 +// CHECK1-64-NEXT: [[TMP123:%.*]] = bitcast i8** [[TMP122]] to %struct.TT.0** +// CHECK1-64-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[TMP123]], align 8 +// CHECK1-64-NEXT: [[TMP124:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS11]], i32 0, i32 1 +// CHECK1-64-NEXT: [[TMP125:%.*]] = bitcast i8** [[TMP124]] to %struct.TT.0** +// CHECK1-64-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[TMP125]], align 8 +// CHECK1-64-NEXT: [[TMP126:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS12]], i64 0, i64 1 +// CHECK1-64-NEXT: store i8* null, i8** [[TMP126]], align 8 +// CHECK1-64-NEXT: [[TMP127:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0 +// CHECK1-64-NEXT: [[TMP128:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS11]], i32 0, i32 0 +// CHECK1-64-NEXT: [[KERNEL_ARGS13:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK1-64-NEXT: [[TMP129:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 0 +// CHECK1-64-NEXT: store i32 2, i32* [[TMP129]], align 4 +// CHECK1-64-NEXT: [[TMP130:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 1 +// CHECK1-64-NEXT: store i32 2, i32* [[TMP130]], align 4 +// CHECK1-64-NEXT: [[TMP131:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 2 +// CHECK1-64-NEXT: store i8** [[TMP127]], i8*** [[TMP131]], align 8 +// CHECK1-64-NEXT: [[TMP132:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 3 +// CHECK1-64-NEXT: store i8** [[TMP128]], i8*** [[TMP132]], align 8 +// CHECK1-64-NEXT: [[TMP133:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 4 +// CHECK1-64-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.3, i32 0, i32 0), i64** [[TMP133]], align 8 +// CHECK1-64-NEXT: [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 5 +// CHECK1-64-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i32 0, i32 0), i64** [[TMP134]], align 8 +// CHECK1-64-NEXT: 
[[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 6 +// CHECK1-64-NEXT: store i8** null, i8*** [[TMP135]], align 8 +// CHECK1-64-NEXT: [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 7 +// CHECK1-64-NEXT: store i8** null, i8*** [[TMP136]], align 8 +// CHECK1-64-NEXT: [[TMP137:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 8 +// CHECK1-64-NEXT: store i64 0, i64* [[TMP137]], align 8 +// CHECK1-64-NEXT: [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 9 +// CHECK1-64-NEXT: store i64 0, i64* [[TMP138]], align 8 +// CHECK1-64-NEXT: [[TMP139:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 10 +// CHECK1-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP139]], align 4 +// CHECK1-64-NEXT: [[TMP140:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 11 +// CHECK1-64-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP140]], align 4 +// CHECK1-64-NEXT: [[TMP141:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 12 +// CHECK1-64-NEXT: store i32 0, i32* [[TMP141]], align 4 +// CHECK1-64-NEXT: [[TMP142:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]]) +// CHECK1-64-NEXT: [[TMP143:%.*]] = icmp ne i32 [[TMP142]], 0 +// CHECK1-64-NEXT: br i1 [[TMP143]], label [[OMP_OFFLOAD_FAILED14:%.*]], label [[OMP_OFFLOAD_CONT15:%.*]] +// CHECK1-64: omp_offload.failed14: +// CHECK1-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111(double* [[TMP116]], %struct.TT.0* [[E]]) #[[ATTR3]] +// CHECK1-64-NEXT: br label [[OMP_OFFLOAD_CONT15]] +// CHECK1-64: omp_offload.cont15: +// CHECK1-64-NEXT: [[TMP144:%.*]] = load i32, i32* [[A]], align 4 +// CHECK1-64-NEXT: [[TMP145:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// CHECK1-64-NEXT: call void @llvm.stackrestore(i8* [[TMP145]]) +// CHECK1-64-NEXT: ret i32 [[TMP144]] +// CHECK1-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63 +// CHECK1-64-SAME: (i64 noundef [[A:%.*]], i32* noundef [[P:%.*]], i64 noundef [[GA:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK1-64-NEXT: entry: +// CHECK1-64-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-64-NEXT: [[P_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-64-NEXT: [[GA_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-64-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK1-64-NEXT: store i32* [[P]], i32** [[P_ADDR]], align 8 +// CHECK1-64-NEXT: store i64 [[GA]], i64* [[GA_ADDR]], align 8 +// CHECK1-64-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// CHECK1-64-NEXT: [[CONV1:%.*]] = bitcast i64* [[GA_ADDR]] to i32* +// CHECK1-64-NEXT: ret void +// CHECK1-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70 +// CHECK1-64-SAME: (i64 noundef [[AA:%.*]], [10 x float]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i64 noundef [[VLA:%.*]], float* noundef nonnull align 4 dereferenceable(4) [[BN:%.*]], [5 x 
[10 x double]]* noundef nonnull align 8 dereferenceable(400) [[C:%.*]], i64 noundef [[VLA1:%.*]], i64 noundef [[VLA3:%.*]], double* noundef nonnull align 8 dereferenceable(8) [[CN:%.*]], %struct.TT* noundef nonnull align 8 dereferenceable(16) [[D:%.*]]) #[[ATTR2]] { +// CHECK1-64-NEXT: entry: +// CHECK1-64-NEXT: [[AA_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-64-NEXT: [[B_ADDR:%.*]] = alloca [10 x float]*, align 8 +// CHECK1-64-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-64-NEXT: [[BN_ADDR:%.*]] = alloca float*, align 8 +// CHECK1-64-NEXT: [[C_ADDR:%.*]] = alloca [5 x [10 x double]]*, align 8 +// CHECK1-64-NEXT: [[VLA_ADDR2:%.*]] = alloca i64, align 8 +// CHECK1-64-NEXT: [[VLA_ADDR4:%.*]] = alloca i64, align 8 +// CHECK1-64-NEXT: [[CN_ADDR:%.*]] = alloca double*, align 8 +// CHECK1-64-NEXT: [[D_ADDR:%.*]] = alloca %struct.TT*, align 8 +// CHECK1-64-NEXT: [[B5:%.*]] = alloca [10 x float], align 4 +// CHECK1-64-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// CHECK1-64-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// CHECK1-64-NEXT: [[C7:%.*]] = alloca [5 x [10 x double]], align 8 +// CHECK1-64-NEXT: [[__VLA_EXPR1:%.*]] = alloca i64, align 8 +// CHECK1-64-NEXT: [[__VLA_EXPR2:%.*]] = alloca i64, align 8 +// CHECK1-64-NEXT: [[D9:%.*]] = alloca [[STRUCT_TT:%.*]], align 8 +// CHECK1-64-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8 +// CHECK1-64-NEXT: store [10 x float]* [[B]], [10 x float]** [[B_ADDR]], align 8 +// CHECK1-64-NEXT: store i64 [[VLA]], i64* [[VLA_ADDR]], align 8 +// CHECK1-64-NEXT: store float* [[BN]], float** [[BN_ADDR]], align 8 +// CHECK1-64-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[C_ADDR]], align 8 +// CHECK1-64-NEXT: store i64 [[VLA1]], i64* [[VLA_ADDR2]], align 8 +// CHECK1-64-NEXT: store i64 [[VLA3]], i64* [[VLA_ADDR4]], align 8 +// CHECK1-64-NEXT: store double* [[CN]], double** [[CN_ADDR]], align 8 +// CHECK1-64-NEXT: store %struct.TT* [[D]], %struct.TT** [[D_ADDR]], align 8 +// CHECK1-64-NEXT: [[CONV:%.*]] = bitcast i64* [[AA_ADDR]] to i16* +// CHECK1-64-NEXT: [[TMP0:%.*]] = load [10 x float]*, [10 x float]** [[B_ADDR]], align 8 +// CHECK1-64-NEXT: [[TMP1:%.*]] = load i64, i64* [[VLA_ADDR]], align 8 +// CHECK1-64-NEXT: [[TMP2:%.*]] = load float*, float** [[BN_ADDR]], align 8 +// CHECK1-64-NEXT: [[TMP3:%.*]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]], align 8 +// CHECK1-64-NEXT: [[TMP4:%.*]] = load i64, i64* [[VLA_ADDR2]], align 8 +// CHECK1-64-NEXT: [[TMP5:%.*]] = load i64, i64* [[VLA_ADDR4]], align 8 +// CHECK1-64-NEXT: [[TMP6:%.*]] = load double*, double** [[CN_ADDR]], align 8 +// CHECK1-64-NEXT: [[TMP7:%.*]] = load %struct.TT*, %struct.TT** [[D_ADDR]], align 8 +// CHECK1-64-NEXT: [[TMP8:%.*]] = bitcast [10 x float]* [[B5]] to i8* +// CHECK1-64-NEXT: [[TMP9:%.*]] = bitcast [10 x float]* [[TMP0]] to i8* +// CHECK1-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i64 40, i1 false) +// CHECK1-64-NEXT: [[TMP10:%.*]] = call i8* @llvm.stacksave() +// CHECK1-64-NEXT: store i8* [[TMP10]], i8** [[SAVED_STACK]], align 8 +// CHECK1-64-NEXT: [[VLA6:%.*]] = alloca float, i64 [[TMP1]], align 4 +// CHECK1-64-NEXT: store i64 [[TMP1]], i64* [[__VLA_EXPR0]], align 8 +// CHECK1-64-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP1]], 4 +// CHECK1-64-NEXT: [[TMP12:%.*]] = bitcast float* [[VLA6]] to i8* +// CHECK1-64-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP2]] to i8* +// CHECK1-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP12]], i8* align 4 [[TMP13]], i64 [[TMP11]], i1 false) 
+// CHECK1-64-NEXT: [[TMP14:%.*]] = bitcast [5 x [10 x double]]* [[C7]] to i8* +// CHECK1-64-NEXT: [[TMP15:%.*]] = bitcast [5 x [10 x double]]* [[TMP3]] to i8* +// CHECK1-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 400, i1 false) +// CHECK1-64-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP4]], [[TMP5]] +// CHECK1-64-NEXT: [[VLA8:%.*]] = alloca double, i64 [[TMP16]], align 8 +// CHECK1-64-NEXT: store i64 [[TMP4]], i64* [[__VLA_EXPR1]], align 8 +// CHECK1-64-NEXT: store i64 [[TMP5]], i64* [[__VLA_EXPR2]], align 8 +// CHECK1-64-NEXT: [[TMP17:%.*]] = mul nuw i64 [[TMP4]], [[TMP5]] +// CHECK1-64-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 8 +// CHECK1-64-NEXT: [[TMP19:%.*]] = bitcast double* [[VLA8]] to i8* +// CHECK1-64-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP6]] to i8* +// CHECK1-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP19]], i8* align 8 [[TMP20]], i64 [[TMP18]], i1 false) +// CHECK1-64-NEXT: [[TMP21:%.*]] = bitcast %struct.TT* [[D9]] to i8* +// CHECK1-64-NEXT: [[TMP22:%.*]] = bitcast %struct.TT* [[TMP7]] to i8* +// CHECK1-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP21]], i8* align 8 [[TMP22]], i64 16, i1 false) +// CHECK1-64-NEXT: [[TMP23:%.*]] = load i16, i16* [[CONV]], align 2 +// CHECK1-64-NEXT: [[CONV10:%.*]] = sext i16 [[TMP23]] to i32 +// CHECK1-64-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV10]], 1 +// CHECK1-64-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD]] to i16 +// CHECK1-64-NEXT: store i16 [[CONV11]], i16* [[CONV]], align 2 +// CHECK1-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[B5]], i64 0, i64 2 +// CHECK1-64-NEXT: store float 1.000000e+00, float* [[ARRAYIDX]], align 4 +// CHECK1-64-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[VLA6]], i64 3 +// CHECK1-64-NEXT: store float 1.000000e+00, float* [[ARRAYIDX12]], align 4 +// CHECK1-64-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[C7]], i64 0, i64 1 +// CHECK1-64-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX13]], i64 0, i64 2 +// CHECK1-64-NEXT: store double 1.000000e+00, double* [[ARRAYIDX14]], align 8 +// CHECK1-64-NEXT: [[TMP24:%.*]] = mul nsw i64 1, [[TMP5]] +// CHECK1-64-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds double, double* [[VLA8]], i64 [[TMP24]] +// CHECK1-64-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX15]], i64 3 +// CHECK1-64-NEXT: store double 1.000000e+00, double* [[ARRAYIDX16]], align 8 +// CHECK1-64-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 0 +// CHECK1-64-NEXT: store i64 1, i64* [[X]], align 8 +// CHECK1-64-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 1 +// CHECK1-64-NEXT: store i8 1, i8* [[Y]], align 8 +// CHECK1-64-NEXT: [[TMP25:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// CHECK1-64-NEXT: call void @llvm.stackrestore(i8* [[TMP25]]) +// CHECK1-64-NEXT: ret void +// CHECK1-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111 +// CHECK1-64-SAME: (double* noundef [[PTR:%.*]], %struct.TT.0* noundef nonnull align 4 dereferenceable(8) [[E:%.*]]) #[[ATTR2]] { +// CHECK1-64-NEXT: entry: +// CHECK1-64-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 8 +// CHECK1-64-NEXT: [[E_ADDR:%.*]] = alloca %struct.TT.0*, align 8 +// CHECK1-64-NEXT: [[E1:%.*]] = alloca [[STRUCT_TT_0:%.*]], align 4 +// CHECK1-64-NEXT: store double* [[PTR]], double** 
[[PTR_ADDR]], align 8 +// CHECK1-64-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[E_ADDR]], align 8 +// CHECK1-64-NEXT: [[TMP0:%.*]] = load %struct.TT.0*, %struct.TT.0** [[E_ADDR]], align 8 +// CHECK1-64-NEXT: [[TMP1:%.*]] = bitcast %struct.TT.0* [[E1]] to i8* +// CHECK1-64-NEXT: [[TMP2:%.*]] = bitcast %struct.TT.0* [[TMP0]] to i8* +// CHECK1-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i64 8, i1 false) +// CHECK1-64-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E1]], i32 0, i32 0 +// CHECK1-64-NEXT: [[TMP3:%.*]] = load i32, i32* [[X]], align 4 +// CHECK1-64-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP3]] to double +// CHECK1-64-NEXT: [[TMP4:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// CHECK1-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[TMP4]], i64 0 +// CHECK1-64-NEXT: store double [[CONV]], double* [[ARRAYIDX]], align 8 +// CHECK1-64-NEXT: [[TMP5:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// CHECK1-64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[TMP5]], i64 0 +// CHECK1-64-NEXT: [[TMP6:%.*]] = load double, double* [[ARRAYIDX2]], align 8 +// CHECK1-64-NEXT: [[INC:%.*]] = fadd double [[TMP6]], 1.000000e+00 +// CHECK1-64-NEXT: store double [[INC]], double* [[ARRAYIDX2]], align 8 +// CHECK1-64-NEXT: ret void +// CHECK1-64-LABEL: define {{[^@]+}}@_Z3bariPd +// CHECK1-64-SAME: (i32 noundef signext [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0]] { +// CHECK1-64-NEXT: entry: +// CHECK1-64-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-64-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 8 +// CHECK1-64-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK1-64-NEXT: [[S:%.*]] = alloca [[STRUCT_S1:%.*]], align 8 +// CHECK1-64-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK1-64-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 8 +// CHECK1-64-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK1-64-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK1-64-NEXT: [[TMP1:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// CHECK1-64-NEXT: [[CALL:%.*]] = call noundef signext i32 @_Z3fooiPd(i32 noundef signext [[TMP0]], double* noundef [[TMP1]]) +// CHECK1-64-NEXT: [[TMP2:%.*]] = load i32, i32* [[A]], align 4 +// CHECK1-64-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[CALL]] +// CHECK1-64-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// CHECK1-64-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK1-64-NEXT: [[CALL1:%.*]] = call noundef signext i32 @_ZN2S12r1Ei(%struct.S1* noundef nonnull align 8 dereferenceable(8) [[S]], i32 noundef signext [[TMP3]]) +// CHECK1-64-NEXT: [[TMP4:%.*]] = load i32, i32* [[A]], align 4 +// CHECK1-64-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], [[CALL1]] +// CHECK1-64-NEXT: store i32 [[ADD2]], i32* [[A]], align 4 +// CHECK1-64-NEXT: [[TMP5:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK1-64-NEXT: [[CALL3:%.*]] = call noundef signext i32 @_ZL7fstatici(i32 noundef signext [[TMP5]]) +// CHECK1-64-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4 +// CHECK1-64-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP6]], [[CALL3]] +// CHECK1-64-NEXT: store i32 [[ADD4]], i32* [[A]], align 4 +// CHECK1-64-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK1-64-NEXT: [[CALL5:%.*]] = call noundef signext i32 @_Z9ftemplateIiET_i(i32 noundef signext [[TMP7]]) +// CHECK1-64-NEXT: [[TMP8:%.*]] = load i32, i32* [[A]], align 4 +// CHECK1-64-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP8]], [[CALL5]] +// CHECK1-64-NEXT: 
store i32 [[ADD6]], i32* [[A]], align 4 +// CHECK1-64-NEXT: [[TMP9:%.*]] = load i32, i32* [[A]], align 4 +// CHECK1-64-NEXT: ret i32 [[TMP9]] +// CHECK1-64-LABEL: define {{[^@]+}}@_ZN2S12r1Ei +// CHECK1-64-SAME: (%struct.S1* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] comdat align 2 { +// CHECK1-64-NEXT: entry: +// CHECK1-64-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 8 +// CHECK1-64-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-64-NEXT: [[B:%.*]] = alloca i32, align 4 +// CHECK1-64-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// CHECK1-64-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// CHECK1-64-NEXT: [[B_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK1-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK1-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [5 x i8*], align 8 +// CHECK1-64-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [5 x i64], align 8 +// CHECK1-64-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 8 +// CHECK1-64-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK1-64-NEXT: [[THIS1:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 8 +// CHECK1-64-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK1-64-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// CHECK1-64-NEXT: store i32 [[ADD]], i32* [[B]], align 4 +// CHECK1-64-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK1-64-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK1-64-NEXT: [[TMP3:%.*]] = call i8* @llvm.stacksave() +// CHECK1-64-NEXT: store i8* [[TMP3]], i8** [[SAVED_STACK]], align 8 +// CHECK1-64-NEXT: [[TMP4:%.*]] = mul nuw i64 2, [[TMP2]] +// CHECK1-64-NEXT: [[VLA:%.*]] = alloca i16, i64 [[TMP4]], align 2 +// CHECK1-64-NEXT: store i64 [[TMP2]], i64* [[__VLA_EXPR0]], align 8 +// CHECK1-64-NEXT: [[TMP5:%.*]] = load i32, i32* [[B]], align 4 +// CHECK1-64-NEXT: [[CONV:%.*]] = bitcast i64* [[B_CASTED]] to i32* +// CHECK1-64-NEXT: store i32 [[TMP5]], i32* [[CONV]], align 4 +// CHECK1-64-NEXT: [[TMP6:%.*]] = load i64, i64* [[B_CASTED]], align 8 +// CHECK1-64-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[THIS1]], i32 0, i32 0 +// CHECK1-64-NEXT: [[TMP7:%.*]] = mul nuw i64 2, [[TMP2]] +// CHECK1-64-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2 +// CHECK1-64-NEXT: [[TMP9:%.*]] = bitcast [5 x i64]* [[DOTOFFLOAD_SIZES]] to i8* +// CHECK1-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP9]], i8* align 8 bitcast ([5 x i64]* @.offload_sizes.5 to i8*), i64 40, i1 false) +// CHECK1-64-NEXT: [[TMP10:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-64-NEXT: [[TMP11:%.*]] = bitcast i8** [[TMP10]] to %struct.S1** +// CHECK1-64-NEXT: store %struct.S1* [[THIS1]], %struct.S1** [[TMP11]], align 8 +// CHECK1-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-64-NEXT: [[TMP13:%.*]] = bitcast i8** [[TMP12]] to double** +// CHECK1-64-NEXT: store double* [[A]], double** [[TMP13]], align 8 +// CHECK1-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK1-64-NEXT: store i8* null, i8** [[TMP14]], align 8 +// CHECK1-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK1-64-NEXT: [[TMP16:%.*]] = bitcast i8** [[TMP15]] to i64* +// CHECK1-64-NEXT: 
store i64 [[TMP6]], i64* [[TMP16]], align 8 +// CHECK1-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK1-64-NEXT: [[TMP18:%.*]] = bitcast i8** [[TMP17]] to i64* +// CHECK1-64-NEXT: store i64 [[TMP6]], i64* [[TMP18]], align 8 +// CHECK1-64-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CHECK1-64-NEXT: store i8* null, i8** [[TMP19]], align 8 +// CHECK1-64-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK1-64-NEXT: [[TMP21:%.*]] = bitcast i8** [[TMP20]] to i64* +// CHECK1-64-NEXT: store i64 2, i64* [[TMP21]], align 8 +// CHECK1-64-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK1-64-NEXT: [[TMP23:%.*]] = bitcast i8** [[TMP22]] to i64* +// CHECK1-64-NEXT: store i64 2, i64* [[TMP23]], align 8 +// CHECK1-64-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2 +// CHECK1-64-NEXT: store i8* null, i8** [[TMP24]], align 8 +// CHECK1-64-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 3 +// CHECK1-64-NEXT: [[TMP26:%.*]] = bitcast i8** [[TMP25]] to i64* +// CHECK1-64-NEXT: store i64 [[TMP2]], i64* [[TMP26]], align 8 +// CHECK1-64-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 3 +// CHECK1-64-NEXT: [[TMP28:%.*]] = bitcast i8** [[TMP27]] to i64* +// CHECK1-64-NEXT: store i64 [[TMP2]], i64* [[TMP28]], align 8 +// CHECK1-64-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 3 +// CHECK1-64-NEXT: store i8* null, i8** [[TMP29]], align 8 +// CHECK1-64-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 4 +// CHECK1-64-NEXT: [[TMP31:%.*]] = bitcast i8** [[TMP30]] to i16** +// CHECK1-64-NEXT: store i16* [[VLA]], i16** [[TMP31]], align 8 +// CHECK1-64-NEXT: [[TMP32:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 4 +// CHECK1-64-NEXT: [[TMP33:%.*]] = bitcast i8** [[TMP32]] to i16** +// CHECK1-64-NEXT: store i16* [[VLA]], i16** [[TMP33]], align 8 +// CHECK1-64-NEXT: [[TMP34:%.*]] = getelementptr inbounds [5 x i64], [5 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 4 +// CHECK1-64-NEXT: store i64 [[TMP8]], i64* [[TMP34]], align 8 +// CHECK1-64-NEXT: [[TMP35:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 4 +// CHECK1-64-NEXT: store i8* null, i8** [[TMP35]], align 8 +// CHECK1-64-NEXT: [[TMP36:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-64-NEXT: [[TMP37:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-64-NEXT: [[TMP38:%.*]] = getelementptr inbounds [5 x i64], [5 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +// CHECK1-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK1-64-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK1-64-NEXT: store i32 2, i32* [[TMP39]], align 4 +// CHECK1-64-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK1-64-NEXT: store i32 5, i32* [[TMP40]], align 4 +// CHECK1-64-NEXT: 
[[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK1-64-NEXT: store i8** [[TMP36]], i8*** [[TMP41]], align 8 +// CHECK1-64-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK1-64-NEXT: store i8** [[TMP37]], i8*** [[TMP42]], align 8 +// CHECK1-64-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK1-64-NEXT: store i64* [[TMP38]], i64** [[TMP43]], align 8 +// CHECK1-64-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK1-64-NEXT: store i64* getelementptr inbounds ([5 x i64], [5 x i64]* @.offload_maptypes.6, i32 0, i32 0), i64** [[TMP44]], align 8 +// CHECK1-64-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK1-64-NEXT: store i8** null, i8*** [[TMP45]], align 8 +// CHECK1-64-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK1-64-NEXT: store i8** null, i8*** [[TMP46]], align 8 +// CHECK1-64-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK1-64-NEXT: store i64 0, i64* [[TMP47]], align 8 +// CHECK1-64-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK1-64-NEXT: store i64 0, i64* [[TMP48]], align 8 +// CHECK1-64-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK1-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP49]], align 4 +// CHECK1-64-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK1-64-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP50]], align 4 +// CHECK1-64-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK1-64-NEXT: store i32 0, i32* [[TMP51]], align 4 +// CHECK1-64-NEXT: [[TMP52:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK1-64-NEXT: [[TMP53:%.*]] = icmp ne i32 [[TMP52]], 0 +// CHECK1-64-NEXT: br i1 [[TMP53]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK1-64: omp_offload.failed: +// CHECK1-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167(%struct.S1* [[THIS1]], i64 [[TMP6]], i64 2, i64 [[TMP2]], i16* [[VLA]]) #[[ATTR3]] +// CHECK1-64-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK1-64: omp_offload.cont: +// CHECK1-64-NEXT: [[TMP54:%.*]] = mul nsw i64 1, [[TMP2]] +// CHECK1-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA]], i64 [[TMP54]] +// CHECK1-64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i64 1 +// CHECK1-64-NEXT: [[TMP55:%.*]] = load i16, i16* [[ARRAYIDX2]], align 2 +// CHECK1-64-NEXT: 
[[CONV3:%.*]] = sext i16 [[TMP55]] to i32 +// CHECK1-64-NEXT: [[TMP56:%.*]] = load i32, i32* [[B]], align 4 +// CHECK1-64-NEXT: [[ADD4:%.*]] = add nsw i32 [[CONV3]], [[TMP56]] +// CHECK1-64-NEXT: [[TMP57:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// CHECK1-64-NEXT: call void @llvm.stackrestore(i8* [[TMP57]]) +// CHECK1-64-NEXT: ret i32 [[ADD4]] +// CHECK1-64-LABEL: define {{[^@]+}}@_ZL7fstatici +// CHECK1-64-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0]] { +// CHECK1-64-NEXT: entry: +// CHECK1-64-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-64-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK1-64-NEXT: [[AAA:%.*]] = alloca i8, align 1 +// CHECK1-64-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// CHECK1-64-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-64-NEXT: [[AAA_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK1-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK1-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x i8*], align 8 +// CHECK1-64-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK1-64-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK1-64-NEXT: store i8 0, i8* [[AAA]], align 1 +// CHECK1-64-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// CHECK1-64-NEXT: [[CONV:%.*]] = bitcast i64* [[A_CASTED]] to i32* +// CHECK1-64-NEXT: store i32 [[TMP0]], i32* [[CONV]], align 4 +// CHECK1-64-NEXT: [[TMP1:%.*]] = load i64, i64* [[A_CASTED]], align 8 +// CHECK1-64-NEXT: [[TMP2:%.*]] = load i8, i8* [[AAA]], align 1 +// CHECK1-64-NEXT: [[CONV1:%.*]] = bitcast i64* [[AAA_CASTED]] to i8* +// CHECK1-64-NEXT: store i8 [[TMP2]], i8* [[CONV1]], align 1 +// CHECK1-64-NEXT: [[TMP3:%.*]] = load i64, i64* [[AAA_CASTED]], align 8 +// CHECK1-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-64-NEXT: [[TMP5:%.*]] = bitcast i8** [[TMP4]] to i64* +// CHECK1-64-NEXT: store i64 [[TMP1]], i64* [[TMP5]], align 8 +// CHECK1-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-64-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i64* +// CHECK1-64-NEXT: store i64 [[TMP1]], i64* [[TMP7]], align 8 +// CHECK1-64-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK1-64-NEXT: store i8* null, i8** [[TMP8]], align 8 +// CHECK1-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK1-64-NEXT: [[TMP10:%.*]] = bitcast i8** [[TMP9]] to i64* +// CHECK1-64-NEXT: store i64 [[TMP3]], i64* [[TMP10]], align 8 +// CHECK1-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK1-64-NEXT: [[TMP12:%.*]] = bitcast i8** [[TMP11]] to i64* +// CHECK1-64-NEXT: store i64 [[TMP3]], i64* [[TMP12]], align 8 +// CHECK1-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CHECK1-64-NEXT: store i8* null, i8** [[TMP13]], align 8 +// CHECK1-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK1-64-NEXT: [[TMP15:%.*]] = bitcast i8** [[TMP14]] to [10 x i32]** +// CHECK1-64-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP15]], align 8 +// CHECK1-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK1-64-NEXT: [[TMP17:%.*]] = bitcast i8** [[TMP16]] to 
[10 x i32]** +// CHECK1-64-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP17]], align 8 +// CHECK1-64-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2 +// CHECK1-64-NEXT: store i8* null, i8** [[TMP18]], align 8 +// CHECK1-64-NEXT: [[TMP19:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-64-NEXT: [[TMP20:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK1-64-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK1-64-NEXT: store i32 2, i32* [[TMP21]], align 4 +// CHECK1-64-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK1-64-NEXT: store i32 3, i32* [[TMP22]], align 4 +// CHECK1-64-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK1-64-NEXT: store i8** [[TMP19]], i8*** [[TMP23]], align 8 +// CHECK1-64-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK1-64-NEXT: store i8** [[TMP20]], i8*** [[TMP24]], align 8 +// CHECK1-64-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK1-64-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_sizes.7, i32 0, i32 0), i64** [[TMP25]], align 8 +// CHECK1-64-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK1-64-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_maptypes.8, i32 0, i32 0), i64** [[TMP26]], align 8 +// CHECK1-64-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK1-64-NEXT: store i8** null, i8*** [[TMP27]], align 8 +// CHECK1-64-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK1-64-NEXT: store i8** null, i8*** [[TMP28]], align 8 +// CHECK1-64-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK1-64-NEXT: store i64 0, i64* [[TMP29]], align 8 +// CHECK1-64-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK1-64-NEXT: store i64 0, i64* [[TMP30]], align 8 +// CHECK1-64-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK1-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP31]], align 4 +// CHECK1-64-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK1-64-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP32]], align 4 +// CHECK1-64-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 
0, i32 12 +// CHECK1-64-NEXT: store i32 0, i32* [[TMP33]], align 4 +// CHECK1-64-NEXT: [[TMP34:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK1-64-NEXT: [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0 +// CHECK1-64-NEXT: br i1 [[TMP35]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK1-64: omp_offload.failed: +// CHECK1-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142(i64 [[TMP1]], i64 [[TMP3]], [10 x i32]* [[B]]) #[[ATTR3]] +// CHECK1-64-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK1-64: omp_offload.cont: +// CHECK1-64-NEXT: [[TMP36:%.*]] = load i32, i32* [[A]], align 4 +// CHECK1-64-NEXT: ret i32 [[TMP36]] +// CHECK1-64-LABEL: define {{[^@]+}}@_Z9ftemplateIiET_i +// CHECK1-64-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0]] comdat { +// CHECK1-64-NEXT: entry: +// CHECK1-64-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-64-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK1-64-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// CHECK1-64-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-64-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK1-64-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK1-64-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// CHECK1-64-NEXT: [[CONV:%.*]] = bitcast i64* [[A_CASTED]] to i32* +// CHECK1-64-NEXT: store i32 [[TMP0]], i32* [[CONV]], align 4 +// CHECK1-64-NEXT: [[TMP1:%.*]] = load i64, i64* [[A_CASTED]], align 8 +// CHECK1-64-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-64-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i64* +// CHECK1-64-NEXT: store i64 [[TMP1]], i64* [[TMP3]], align 8 +// CHECK1-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-64-NEXT: [[TMP5:%.*]] = bitcast i8** [[TMP4]] to i64* +// CHECK1-64-NEXT: store i64 [[TMP1]], i64* [[TMP5]], align 8 +// CHECK1-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK1-64-NEXT: store i8* null, i8** [[TMP6]], align 8 +// CHECK1-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK1-64-NEXT: [[TMP8:%.*]] = bitcast i8** [[TMP7]] to [10 x i32]** +// CHECK1-64-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP8]], align 8 +// CHECK1-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK1-64-NEXT: [[TMP10:%.*]] = bitcast i8** [[TMP9]] to [10 x i32]** +// CHECK1-64-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP10]], align 8 +// CHECK1-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CHECK1-64-NEXT: store i8* null, i8** [[TMP11]], align 8 +// CHECK1-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// 
CHECK1-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK1-64-NEXT: store i32 2, i32* [[TMP14]], align 4 +// CHECK1-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK1-64-NEXT: store i32 2, i32* [[TMP15]], align 4 +// CHECK1-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK1-64-NEXT: store i8** [[TMP12]], i8*** [[TMP16]], align 8 +// CHECK1-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK1-64-NEXT: store i8** [[TMP13]], i8*** [[TMP17]], align 8 +// CHECK1-64-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK1-64-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.9, i32 0, i32 0), i64** [[TMP18]], align 8 +// CHECK1-64-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK1-64-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.10, i32 0, i32 0), i64** [[TMP19]], align 8 +// CHECK1-64-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK1-64-NEXT: store i8** null, i8*** [[TMP20]], align 8 +// CHECK1-64-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK1-64-NEXT: store i8** null, i8*** [[TMP21]], align 8 +// CHECK1-64-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK1-64-NEXT: store i64 0, i64* [[TMP22]], align 8 +// CHECK1-64-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK1-64-NEXT: store i64 0, i64* [[TMP23]], align 8 +// CHECK1-64-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK1-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP24]], align 4 +// CHECK1-64-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK1-64-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP25]], align 4 +// CHECK1-64-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK1-64-NEXT: store i32 0, i32* [[TMP26]], align 4 +// CHECK1-64-NEXT: [[TMP27:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK1-64-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0 +// CHECK1-64-NEXT: br i1 [[TMP28]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK1-64: omp_offload.failed: +// CHECK1-64-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128(i64 [[TMP1]], [10 x i32]* [[B]]) #[[ATTR3]] +// CHECK1-64-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK1-64: omp_offload.cont: +// CHECK1-64-NEXT: [[TMP29:%.*]] = load i32, i32* [[A]], align 4 +// CHECK1-64-NEXT: ret i32 [[TMP29]] +// CHECK1-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167 +// CHECK1-64-SAME: (%struct.S1* noundef [[THIS:%.*]], i64 noundef [[B:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR2]] { +// CHECK1-64-NEXT: entry: +// CHECK1-64-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 8 +// CHECK1-64-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-64-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-64-NEXT: [[VLA_ADDR2:%.*]] = alloca i64, align 8 +// CHECK1-64-NEXT: [[C_ADDR:%.*]] = alloca i16*, align 8 +// CHECK1-64-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// CHECK1-64-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// CHECK1-64-NEXT: [[__VLA_EXPR1:%.*]] = alloca i64, align 8 +// CHECK1-64-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 8 +// CHECK1-64-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8 +// CHECK1-64-NEXT: store i64 [[VLA]], i64* [[VLA_ADDR]], align 8 +// CHECK1-64-NEXT: store i64 [[VLA1]], i64* [[VLA_ADDR2]], align 8 +// CHECK1-64-NEXT: store i16* [[C]], i16** [[C_ADDR]], align 8 +// CHECK1-64-NEXT: [[TMP0:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 8 +// CHECK1-64-NEXT: [[CONV:%.*]] = bitcast i64* [[B_ADDR]] to i32* +// CHECK1-64-NEXT: [[TMP1:%.*]] = load i64, i64* [[VLA_ADDR]], align 8 +// CHECK1-64-NEXT: [[TMP2:%.*]] = load i64, i64* [[VLA_ADDR2]], align 8 +// CHECK1-64-NEXT: [[TMP3:%.*]] = load i16*, i16** [[C_ADDR]], align 8 +// CHECK1-64-NEXT: [[TMP4:%.*]] = call i8* @llvm.stacksave() +// CHECK1-64-NEXT: store i8* [[TMP4]], i8** [[SAVED_STACK]], align 8 +// CHECK1-64-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP1]], [[TMP2]] +// CHECK1-64-NEXT: [[VLA3:%.*]] = alloca i16, i64 [[TMP5]], align 2 +// CHECK1-64-NEXT: store i64 [[TMP1]], i64* [[__VLA_EXPR0]], align 8 +// CHECK1-64-NEXT: store i64 [[TMP2]], i64* [[__VLA_EXPR1]], align 8 +// CHECK1-64-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP1]], [[TMP2]] +// CHECK1-64-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 2 +// CHECK1-64-NEXT: [[TMP8:%.*]] = bitcast i16* [[VLA3]] to i8* +// CHECK1-64-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP3]] to i8* +// CHECK1-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 [[TMP8]], i8* align 2 [[TMP9]], i64 [[TMP7]], i1 false) +// CHECK1-64-NEXT: [[TMP10:%.*]] = load i32, i32* [[CONV]], align 4 +// CHECK1-64-NEXT: [[CONV4:%.*]] = sitofp i32 [[TMP10]] to double +// CHECK1-64-NEXT: [[ADD:%.*]] = fadd double [[CONV4]], 1.500000e+00 +// CHECK1-64-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[TMP0]], i32 0, i32 0 +// CHECK1-64-NEXT: store double [[ADD]], double* [[A]], align 8 +// CHECK1-64-NEXT: [[A5:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0 +// CHECK1-64-NEXT: [[TMP11:%.*]] = load double, double* [[A5]], align 8 +// CHECK1-64-NEXT: [[INC:%.*]] = fadd double [[TMP11]], 1.000000e+00 +// CHECK1-64-NEXT: store double [[INC]], double* [[A5]], align 8 +// CHECK1-64-NEXT: [[CONV6:%.*]] = fptosi double [[INC]] to i16 +// CHECK1-64-NEXT: [[TMP12:%.*]] = mul nsw i64 1, [[TMP2]] +// CHECK1-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA3]], i64 [[TMP12]] +// CHECK1-64-NEXT: 
[[ARRAYIDX7:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i64 1 +// CHECK1-64-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX7]], align 2 +// CHECK1-64-NEXT: [[TMP13:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// CHECK1-64-NEXT: call void @llvm.stackrestore(i8* [[TMP13]]) +// CHECK1-64-NEXT: ret void +// CHECK1-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142 +// CHECK1-64-SAME: (i64 noundef [[A:%.*]], i64 noundef [[AAA:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR2]] { +// CHECK1-64-NEXT: entry: +// CHECK1-64-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-64-NEXT: [[AAA_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-64-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-64-NEXT: [[B2:%.*]] = alloca [10 x i32], align 4 +// CHECK1-64-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK1-64-NEXT: store i64 [[AAA]], i64* [[AAA_ADDR]], align 8 +// CHECK1-64-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-64-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// CHECK1-64-NEXT: [[CONV1:%.*]] = bitcast i64* [[AAA_ADDR]] to i8* +// CHECK1-64-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-64-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B2]] to i8* +// CHECK1-64-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK1-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i64 40, i1 false) +// CHECK1-64-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 4 +// CHECK1-64-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK1-64-NEXT: store i32 [[ADD]], i32* [[CONV]], align 4 +// CHECK1-64-NEXT: [[TMP4:%.*]] = load i8, i8* [[CONV1]], align 1 +// CHECK1-64-NEXT: [[CONV3:%.*]] = sext i8 [[TMP4]] to i32 +// CHECK1-64-NEXT: [[ADD4:%.*]] = add nsw i32 [[CONV3]], 1 +// CHECK1-64-NEXT: [[CONV5:%.*]] = trunc i32 [[ADD4]] to i8 +// CHECK1-64-NEXT: store i8 [[CONV5]], i8* [[CONV1]], align 1 +// CHECK1-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B2]], i64 0, i64 2 +// CHECK1-64-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK1-64-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK1-64-NEXT: store i32 [[ADD6]], i32* [[ARRAYIDX]], align 4 +// CHECK1-64-NEXT: ret void +// CHECK1-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128 +// CHECK1-64-SAME: (i64 noundef [[A:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR2]] { +// CHECK1-64-NEXT: entry: +// CHECK1-64-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-64-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-64-NEXT: [[B1:%.*]] = alloca [10 x i32], align 4 +// CHECK1-64-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK1-64-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-64-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// CHECK1-64-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-64-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B1]] to i8* +// CHECK1-64-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK1-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i64 40, i1 false) +// CHECK1-64-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 4 +// CHECK1-64-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK1-64-NEXT: store i32 [[ADD]], i32* [[CONV]], align 4 +// CHECK1-64-NEXT: 
[[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B1]], i64 0, i64 2 +// CHECK1-64-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK1-64-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK1-64-NEXT: store i32 [[ADD2]], i32* [[ARRAYIDX]], align 4 +// CHECK1-64-NEXT: ret void +// CHECK1-64-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// CHECK1-64-SAME: () #[[ATTR5:[0-9]+]] { +// CHECK1-64-NEXT: entry: +// CHECK1-64-NEXT: call void @__tgt_register_requires(i64 1) +// CHECK1-64-NEXT: ret void +// CHECK2-32-LABEL: define {{[^@]+}}@_Z3fooiPd +// CHECK2-32-SAME: (i32 noundef [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK2-32-NEXT: entry: +// CHECK2-32-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 4 +// CHECK2-32-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[AA:%.*]] = alloca i16, align 2 +// CHECK2-32-NEXT: [[B:%.*]] = alloca [10 x float], align 4 +// CHECK2-32-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// CHECK2-32-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[C:%.*]] = alloca [5 x [10 x double]], align 8 +// CHECK2-32-NEXT: [[__VLA_EXPR1:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[D:%.*]] = alloca [[STRUCT_TT:%.*]], align 4 +// CHECK2-32-NEXT: [[E:%.*]] = alloca [[STRUCT_TT_0:%.*]], align 4 +// CHECK2-32-NEXT: [[P:%.*]] = alloca i32*, align 64 +// CHECK2-32-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[GA_CASTED:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK2-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK2-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x i8*], align 4 +// CHECK2-32-NEXT: [[AA_CASTED:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[DOTOFFLOAD_BASEPTRS2:%.*]] = alloca [9 x i8*], align 4 +// CHECK2-32-NEXT: [[DOTOFFLOAD_PTRS3:%.*]] = alloca [9 x i8*], align 4 +// CHECK2-32-NEXT: [[DOTOFFLOAD_MAPPERS4:%.*]] = alloca [9 x i8*], align 4 +// CHECK2-32-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [9 x i64], align 4 +// CHECK2-32-NEXT: [[DOTOFFLOAD_BASEPTRS8:%.*]] = alloca [2 x i8*], align 4 +// CHECK2-32-NEXT: [[DOTOFFLOAD_PTRS9:%.*]] = alloca [2 x i8*], align 4 +// CHECK2-32-NEXT: [[DOTOFFLOAD_MAPPERS10:%.*]] = alloca [2 x i8*], align 4 +// CHECK2-32-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK2-32-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 4 +// CHECK2-32-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK2-32-NEXT: store i16 0, i16* [[AA]], align 2 +// CHECK2-32-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-32-NEXT: [[TMP1:%.*]] = call i8* @llvm.stacksave() +// CHECK2-32-NEXT: store i8* [[TMP1]], i8** [[SAVED_STACK]], align 4 +// CHECK2-32-NEXT: [[VLA:%.*]] = alloca float, i32 [[TMP0]], align 4 +// CHECK2-32-NEXT: store i32 [[TMP0]], i32* [[__VLA_EXPR0]], align 4 +// CHECK2-32-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-32-NEXT: [[TMP3:%.*]] = mul nuw i32 5, [[TMP2]] +// CHECK2-32-NEXT: [[VLA1:%.*]] = alloca double, i32 [[TMP3]], align 8 +// CHECK2-32-NEXT: store i32 [[TMP2]], i32* [[__VLA_EXPR1]], align 4 +// CHECK2-32-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 0 +// CHECK2-32-NEXT: [[TMP4:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-32-NEXT: store i32 [[TMP4]], i32* [[X]], align 4 +// CHECK2-32-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 
0, i32 1 +// CHECK2-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-32-NEXT: store i32 [[TMP5]], i32* [[Y]], align 4 +// CHECK2-32-NEXT: store i32* [[A]], i32** [[P]], align 64 +// CHECK2-32-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4 +// CHECK2-32-NEXT: store i32 [[TMP6]], i32* [[A_CASTED]], align 4 +// CHECK2-32-NEXT: [[TMP7:%.*]] = load i32, i32* [[A_CASTED]], align 4 +// CHECK2-32-NEXT: [[TMP8:%.*]] = load i32*, i32** [[P]], align 64 +// CHECK2-32-NEXT: [[TMP9:%.*]] = load i32, i32* @ga, align 4 +// CHECK2-32-NEXT: store i32 [[TMP9]], i32* [[GA_CASTED]], align 4 +// CHECK2-32-NEXT: [[TMP10:%.*]] = load i32, i32* [[GA_CASTED]], align 4 +// CHECK2-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK2-32-NEXT: [[TMP12:%.*]] = bitcast i8** [[TMP11]] to i32* +// CHECK2-32-NEXT: store i32 [[TMP7]], i32* [[TMP12]], align 4 +// CHECK2-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK2-32-NEXT: [[TMP14:%.*]] = bitcast i8** [[TMP13]] to i32* +// CHECK2-32-NEXT: store i32 [[TMP7]], i32* [[TMP14]], align 4 +// CHECK2-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CHECK2-32-NEXT: store i8* null, i8** [[TMP15]], align 4 +// CHECK2-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK2-32-NEXT: [[TMP17:%.*]] = bitcast i8** [[TMP16]] to i32** +// CHECK2-32-NEXT: store i32* [[TMP8]], i32** [[TMP17]], align 4 +// CHECK2-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK2-32-NEXT: [[TMP19:%.*]] = bitcast i8** [[TMP18]] to i32** +// CHECK2-32-NEXT: store i32* [[TMP8]], i32** [[TMP19]], align 4 +// CHECK2-32-NEXT: [[TMP20:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1 +// CHECK2-32-NEXT: store i8* null, i8** [[TMP20]], align 4 +// CHECK2-32-NEXT: [[TMP21:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK2-32-NEXT: [[TMP22:%.*]] = bitcast i8** [[TMP21]] to i32* +// CHECK2-32-NEXT: store i32 [[TMP10]], i32* [[TMP22]], align 4 +// CHECK2-32-NEXT: [[TMP23:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK2-32-NEXT: [[TMP24:%.*]] = bitcast i8** [[TMP23]] to i32* +// CHECK2-32-NEXT: store i32 [[TMP10]], i32* [[TMP24]], align 4 +// CHECK2-32-NEXT: [[TMP25:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2 +// CHECK2-32-NEXT: store i8* null, i8** [[TMP25]], align 4 +// CHECK2-32-NEXT: [[TMP26:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK2-32-NEXT: [[TMP27:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK2-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK2-32-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK2-32-NEXT: store i32 2, i32* [[TMP28]], align 4 +// CHECK2-32-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK2-32-NEXT: store i32 3, i32* [[TMP29]], align 4 +// CHECK2-32-NEXT: [[TMP30:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK2-32-NEXT: store i8** [[TMP26]], i8*** [[TMP30]], align 4 +// CHECK2-32-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK2-32-NEXT: store i8** [[TMP27]], i8*** [[TMP31]], align 4 +// CHECK2-32-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK2-32-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_sizes, i32 0, i32 0), i64** [[TMP32]], align 4 +// CHECK2-32-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK2-32-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_maptypes, i32 0, i32 0), i64** [[TMP33]], align 4 +// CHECK2-32-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK2-32-NEXT: store i8** null, i8*** [[TMP34]], align 4 +// CHECK2-32-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK2-32-NEXT: store i8** null, i8*** [[TMP35]], align 4 +// CHECK2-32-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK2-32-NEXT: store i64 0, i64* [[TMP36]], align 8 +// CHECK2-32-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK2-32-NEXT: store i64 0, i64* [[TMP37]], align 8 +// CHECK2-32-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK2-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP38]], align 4 +// CHECK2-32-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK2-32-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP39]], align 4 +// CHECK2-32-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK2-32-NEXT: store i32 0, i32* [[TMP40]], align 4 +// CHECK2-32-NEXT: [[TMP41:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK2-32-NEXT: [[TMP42:%.*]] = icmp ne i32 [[TMP41]], 0 +// CHECK2-32-NEXT: br i1 [[TMP42]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK2-32: omp_offload.failed: +// CHECK2-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63(i32 [[TMP7]], i32* [[TMP8]], i32 [[TMP10]]) #[[ATTR3:[0-9]+]] +// CHECK2-32-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK2-32: omp_offload.cont: +// CHECK2-32-NEXT: [[TMP43:%.*]] = load i16, i16* [[AA]], align 2 +// CHECK2-32-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_CASTED]] to i16* +// CHECK2-32-NEXT: store i16 [[TMP43]], i16* [[CONV]], align 2 +// CHECK2-32-NEXT: [[TMP44:%.*]] = load i32, i32* [[AA_CASTED]], align 4 +// CHECK2-32-NEXT: [[TMP45:%.*]] = mul nuw i32 [[TMP0]], 4 +// 
CHECK2-32-NEXT: [[TMP46:%.*]] = sext i32 [[TMP45]] to i64 +// CHECK2-32-NEXT: [[TMP47:%.*]] = mul nuw i32 5, [[TMP2]] +// CHECK2-32-NEXT: [[TMP48:%.*]] = mul nuw i32 [[TMP47]], 8 +// CHECK2-32-NEXT: [[TMP49:%.*]] = sext i32 [[TMP48]] to i64 +// CHECK2-32-NEXT: [[TMP50:%.*]] = bitcast [9 x i64]* [[DOTOFFLOAD_SIZES]] to i8* +// CHECK2-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP50]], i8* align 4 bitcast ([9 x i64]* @.offload_sizes.1 to i8*), i32 72, i1 false) +// CHECK2-32-NEXT: [[TMP51:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0 +// CHECK2-32-NEXT: [[TMP52:%.*]] = bitcast i8** [[TMP51]] to i32* +// CHECK2-32-NEXT: store i32 [[TMP44]], i32* [[TMP52]], align 4 +// CHECK2-32-NEXT: [[TMP53:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 0 +// CHECK2-32-NEXT: [[TMP54:%.*]] = bitcast i8** [[TMP53]] to i32* +// CHECK2-32-NEXT: store i32 [[TMP44]], i32* [[TMP54]], align 4 +// CHECK2-32-NEXT: [[TMP55:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 0 +// CHECK2-32-NEXT: store i8* null, i8** [[TMP55]], align 4 +// CHECK2-32-NEXT: [[TMP56:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 1 +// CHECK2-32-NEXT: [[TMP57:%.*]] = bitcast i8** [[TMP56]] to [10 x float]** +// CHECK2-32-NEXT: store [10 x float]* [[B]], [10 x float]** [[TMP57]], align 4 +// CHECK2-32-NEXT: [[TMP58:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 1 +// CHECK2-32-NEXT: [[TMP59:%.*]] = bitcast i8** [[TMP58]] to [10 x float]** +// CHECK2-32-NEXT: store [10 x float]* [[B]], [10 x float]** [[TMP59]], align 4 +// CHECK2-32-NEXT: [[TMP60:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 1 +// CHECK2-32-NEXT: store i8* null, i8** [[TMP60]], align 4 +// CHECK2-32-NEXT: [[TMP61:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 2 +// CHECK2-32-NEXT: [[TMP62:%.*]] = bitcast i8** [[TMP61]] to i32* +// CHECK2-32-NEXT: store i32 [[TMP0]], i32* [[TMP62]], align 4 +// CHECK2-32-NEXT: [[TMP63:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 2 +// CHECK2-32-NEXT: [[TMP64:%.*]] = bitcast i8** [[TMP63]] to i32* +// CHECK2-32-NEXT: store i32 [[TMP0]], i32* [[TMP64]], align 4 +// CHECK2-32-NEXT: [[TMP65:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 2 +// CHECK2-32-NEXT: store i8* null, i8** [[TMP65]], align 4 +// CHECK2-32-NEXT: [[TMP66:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 3 +// CHECK2-32-NEXT: [[TMP67:%.*]] = bitcast i8** [[TMP66]] to float** +// CHECK2-32-NEXT: store float* [[VLA]], float** [[TMP67]], align 4 +// CHECK2-32-NEXT: [[TMP68:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 3 +// CHECK2-32-NEXT: [[TMP69:%.*]] = bitcast i8** [[TMP68]] to float** +// CHECK2-32-NEXT: store float* [[VLA]], float** [[TMP69]], align 4 +// CHECK2-32-NEXT: [[TMP70:%.*]] = getelementptr inbounds [9 x i64], [9 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 3 +// CHECK2-32-NEXT: store i64 [[TMP46]], i64* [[TMP70]], align 4 +// CHECK2-32-NEXT: [[TMP71:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 3 +// CHECK2-32-NEXT: store i8* null, i8** [[TMP71]], align 4 +// CHECK2-32-NEXT: [[TMP72:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 
0, i32 4 +// CHECK2-32-NEXT: [[TMP73:%.*]] = bitcast i8** [[TMP72]] to [5 x [10 x double]]** +// CHECK2-32-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[TMP73]], align 4 +// CHECK2-32-NEXT: [[TMP74:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 4 +// CHECK2-32-NEXT: [[TMP75:%.*]] = bitcast i8** [[TMP74]] to [5 x [10 x double]]** +// CHECK2-32-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[TMP75]], align 4 +// CHECK2-32-NEXT: [[TMP76:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 4 +// CHECK2-32-NEXT: store i8* null, i8** [[TMP76]], align 4 +// CHECK2-32-NEXT: [[TMP77:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 5 +// CHECK2-32-NEXT: [[TMP78:%.*]] = bitcast i8** [[TMP77]] to i32* +// CHECK2-32-NEXT: store i32 5, i32* [[TMP78]], align 4 +// CHECK2-32-NEXT: [[TMP79:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 5 +// CHECK2-32-NEXT: [[TMP80:%.*]] = bitcast i8** [[TMP79]] to i32* +// CHECK2-32-NEXT: store i32 5, i32* [[TMP80]], align 4 +// CHECK2-32-NEXT: [[TMP81:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 5 +// CHECK2-32-NEXT: store i8* null, i8** [[TMP81]], align 4 +// CHECK2-32-NEXT: [[TMP82:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 6 +// CHECK2-32-NEXT: [[TMP83:%.*]] = bitcast i8** [[TMP82]] to i32* +// CHECK2-32-NEXT: store i32 [[TMP2]], i32* [[TMP83]], align 4 +// CHECK2-32-NEXT: [[TMP84:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 6 +// CHECK2-32-NEXT: [[TMP85:%.*]] = bitcast i8** [[TMP84]] to i32* +// CHECK2-32-NEXT: store i32 [[TMP2]], i32* [[TMP85]], align 4 +// CHECK2-32-NEXT: [[TMP86:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 6 +// CHECK2-32-NEXT: store i8* null, i8** [[TMP86]], align 4 +// CHECK2-32-NEXT: [[TMP87:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 7 +// CHECK2-32-NEXT: [[TMP88:%.*]] = bitcast i8** [[TMP87]] to double** +// CHECK2-32-NEXT: store double* [[VLA1]], double** [[TMP88]], align 4 +// CHECK2-32-NEXT: [[TMP89:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 7 +// CHECK2-32-NEXT: [[TMP90:%.*]] = bitcast i8** [[TMP89]] to double** +// CHECK2-32-NEXT: store double* [[VLA1]], double** [[TMP90]], align 4 +// CHECK2-32-NEXT: [[TMP91:%.*]] = getelementptr inbounds [9 x i64], [9 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 7 +// CHECK2-32-NEXT: store i64 [[TMP49]], i64* [[TMP91]], align 4 +// CHECK2-32-NEXT: [[TMP92:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 7 +// CHECK2-32-NEXT: store i8* null, i8** [[TMP92]], align 4 +// CHECK2-32-NEXT: [[TMP93:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 8 +// CHECK2-32-NEXT: [[TMP94:%.*]] = bitcast i8** [[TMP93]] to %struct.TT** +// CHECK2-32-NEXT: store %struct.TT* [[D]], %struct.TT** [[TMP94]], align 4 +// CHECK2-32-NEXT: [[TMP95:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 8 +// CHECK2-32-NEXT: [[TMP96:%.*]] = bitcast i8** [[TMP95]] to %struct.TT** +// CHECK2-32-NEXT: store %struct.TT* [[D]], %struct.TT** [[TMP96]], align 4 +// CHECK2-32-NEXT: [[TMP97:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 8 +// 
CHECK2-32-NEXT: store i8* null, i8** [[TMP97]], align 4 +// CHECK2-32-NEXT: [[TMP98:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0 +// CHECK2-32-NEXT: [[TMP99:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 0 +// CHECK2-32-NEXT: [[TMP100:%.*]] = getelementptr inbounds [9 x i64], [9 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +// CHECK2-32-NEXT: [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK2-32-NEXT: [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 0 +// CHECK2-32-NEXT: store i32 2, i32* [[TMP101]], align 4 +// CHECK2-32-NEXT: [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 1 +// CHECK2-32-NEXT: store i32 9, i32* [[TMP102]], align 4 +// CHECK2-32-NEXT: [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 2 +// CHECK2-32-NEXT: store i8** [[TMP98]], i8*** [[TMP103]], align 4 +// CHECK2-32-NEXT: [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 3 +// CHECK2-32-NEXT: store i8** [[TMP99]], i8*** [[TMP104]], align 4 +// CHECK2-32-NEXT: [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 4 +// CHECK2-32-NEXT: store i64* [[TMP100]], i64** [[TMP105]], align 4 +// CHECK2-32-NEXT: [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 5 +// CHECK2-32-NEXT: store i64* getelementptr inbounds ([9 x i64], [9 x i64]* @.offload_maptypes.2, i32 0, i32 0), i64** [[TMP106]], align 4 +// CHECK2-32-NEXT: [[TMP107:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 6 +// CHECK2-32-NEXT: store i8** null, i8*** [[TMP107]], align 4 +// CHECK2-32-NEXT: [[TMP108:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 7 +// CHECK2-32-NEXT: store i8** null, i8*** [[TMP108]], align 4 +// CHECK2-32-NEXT: [[TMP109:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 8 +// CHECK2-32-NEXT: store i64 0, i64* [[TMP109]], align 8 +// CHECK2-32-NEXT: [[TMP110:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 9 +// CHECK2-32-NEXT: store i64 0, i64* [[TMP110]], align 8 +// CHECK2-32-NEXT: [[TMP111:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 10 +// CHECK2-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP111]], align 4 +// CHECK2-32-NEXT: [[TMP112:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 11 +// CHECK2-32-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP112]], align 4 +// CHECK2-32-NEXT: [[TMP113:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 12 +// CHECK2-32-NEXT: store i32 0, i32* [[TMP113]], align 4 +// CHECK2-32-NEXT: [[TMP114:%.*]] = call i32 
@__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]]) +// CHECK2-32-NEXT: [[TMP115:%.*]] = icmp ne i32 [[TMP114]], 0 +// CHECK2-32-NEXT: br i1 [[TMP115]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] +// CHECK2-32: omp_offload.failed6: +// CHECK2-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70(i32 [[TMP44]], [10 x float]* [[B]], i32 [[TMP0]], float* [[VLA]], [5 x [10 x double]]* [[C]], i32 5, i32 [[TMP2]], double* [[VLA1]], %struct.TT* [[D]]) #[[ATTR3]] +// CHECK2-32-NEXT: br label [[OMP_OFFLOAD_CONT7]] +// CHECK2-32: omp_offload.cont7: +// CHECK2-32-NEXT: [[TMP116:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// CHECK2-32-NEXT: [[TMP117:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0 +// CHECK2-32-NEXT: [[TMP118:%.*]] = bitcast i8** [[TMP117]] to double** +// CHECK2-32-NEXT: store double* [[TMP116]], double** [[TMP118]], align 4 +// CHECK2-32-NEXT: [[TMP119:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS9]], i32 0, i32 0 +// CHECK2-32-NEXT: [[TMP120:%.*]] = bitcast i8** [[TMP119]] to double** +// CHECK2-32-NEXT: store double* [[TMP116]], double** [[TMP120]], align 4 +// CHECK2-32-NEXT: [[TMP121:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS10]], i32 0, i32 0 +// CHECK2-32-NEXT: store i8* null, i8** [[TMP121]], align 4 +// CHECK2-32-NEXT: [[TMP122:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 1 +// CHECK2-32-NEXT: [[TMP123:%.*]] = bitcast i8** [[TMP122]] to %struct.TT.0** +// CHECK2-32-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[TMP123]], align 4 +// CHECK2-32-NEXT: [[TMP124:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS9]], i32 0, i32 1 +// CHECK2-32-NEXT: [[TMP125:%.*]] = bitcast i8** [[TMP124]] to %struct.TT.0** +// CHECK2-32-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[TMP125]], align 4 +// CHECK2-32-NEXT: [[TMP126:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS10]], i32 0, i32 1 +// CHECK2-32-NEXT: store i8* null, i8** [[TMP126]], align 4 +// CHECK2-32-NEXT: [[TMP127:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0 +// CHECK2-32-NEXT: [[TMP128:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS9]], i32 0, i32 0 +// CHECK2-32-NEXT: [[KERNEL_ARGS11:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK2-32-NEXT: [[TMP129:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 0 +// CHECK2-32-NEXT: store i32 2, i32* [[TMP129]], align 4 +// CHECK2-32-NEXT: [[TMP130:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 1 +// CHECK2-32-NEXT: store i32 2, i32* [[TMP130]], align 4 +// CHECK2-32-NEXT: [[TMP131:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 2 +// CHECK2-32-NEXT: store i8** [[TMP127]], i8*** [[TMP131]], align 4 +// CHECK2-32-NEXT: [[TMP132:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 3 +// CHECK2-32-NEXT: store i8** [[TMP128]], i8*** [[TMP132]], align 4 +// CHECK2-32-NEXT: [[TMP133:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 4 +// CHECK2-32-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.3, i32 0, i32 0), i64** [[TMP133]], align 4 +// CHECK2-32-NEXT: [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 5 +// CHECK2-32-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i32 0, i32 0), i64** [[TMP134]], align 4 +// CHECK2-32-NEXT: [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 6 +// CHECK2-32-NEXT: store i8** null, i8*** [[TMP135]], align 4 +// CHECK2-32-NEXT: [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 7 +// CHECK2-32-NEXT: store i8** null, i8*** [[TMP136]], align 4 +// CHECK2-32-NEXT: [[TMP137:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 8 +// CHECK2-32-NEXT: store i64 0, i64* [[TMP137]], align 8 +// CHECK2-32-NEXT: [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 9 +// CHECK2-32-NEXT: store i64 0, i64* [[TMP138]], align 8 +// CHECK2-32-NEXT: [[TMP139:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 10 +// CHECK2-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP139]], align 4 +// CHECK2-32-NEXT: [[TMP140:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 11 +// CHECK2-32-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP140]], align 4 +// CHECK2-32-NEXT: [[TMP141:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 12 +// CHECK2-32-NEXT: store i32 0, i32* [[TMP141]], align 4 +// CHECK2-32-NEXT: [[TMP142:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]]) +// CHECK2-32-NEXT: [[TMP143:%.*]] = icmp ne i32 [[TMP142]], 0 +// CHECK2-32-NEXT: br i1 [[TMP143]], label [[OMP_OFFLOAD_FAILED12:%.*]], label [[OMP_OFFLOAD_CONT13:%.*]] +// CHECK2-32: omp_offload.failed12: +// CHECK2-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111(double* [[TMP116]], %struct.TT.0* [[E]]) #[[ATTR3]] +// CHECK2-32-NEXT: br label [[OMP_OFFLOAD_CONT13]] +// CHECK2-32: omp_offload.cont13: +// CHECK2-32-NEXT: [[TMP144:%.*]] = load i32, i32* [[A]], align 4 +// CHECK2-32-NEXT: [[TMP145:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// CHECK2-32-NEXT: call void @llvm.stackrestore(i8* [[TMP145]]) +// CHECK2-32-NEXT: ret i32 [[TMP144]] +// CHECK2-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63 +// CHECK2-32-SAME: (i32 noundef [[A:%.*]], i32* noundef [[P:%.*]], i32 noundef [[GA:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK2-32-NEXT: entry: +// CHECK2-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[P_ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-32-NEXT: [[GA_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK2-32-NEXT: store i32* [[P]], 
i32** [[P_ADDR]], align 4 +// CHECK2-32-NEXT: store i32 [[GA]], i32* [[GA_ADDR]], align 4 +// CHECK2-32-NEXT: ret void +// CHECK2-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70 +// CHECK2-32-SAME: (i32 noundef [[AA:%.*]], [10 x float]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i32 noundef [[VLA:%.*]], float* noundef nonnull align 4 dereferenceable(4) [[BN:%.*]], [5 x [10 x double]]* noundef nonnull align 4 dereferenceable(400) [[C:%.*]], i32 noundef [[VLA1:%.*]], i32 noundef [[VLA3:%.*]], double* noundef nonnull align 4 dereferenceable(8) [[CN:%.*]], %struct.TT* noundef nonnull align 4 dereferenceable(12) [[D:%.*]]) #[[ATTR2]] { +// CHECK2-32-NEXT: entry: +// CHECK2-32-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[B_ADDR:%.*]] = alloca [10 x float]*, align 4 +// CHECK2-32-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[BN_ADDR:%.*]] = alloca float*, align 4 +// CHECK2-32-NEXT: [[C_ADDR:%.*]] = alloca [5 x [10 x double]]*, align 4 +// CHECK2-32-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[VLA_ADDR4:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[CN_ADDR:%.*]] = alloca double*, align 4 +// CHECK2-32-NEXT: [[D_ADDR:%.*]] = alloca %struct.TT*, align 4 +// CHECK2-32-NEXT: [[B5:%.*]] = alloca [10 x float], align 4 +// CHECK2-32-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// CHECK2-32-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[C7:%.*]] = alloca [5 x [10 x double]], align 8 +// CHECK2-32-NEXT: [[__VLA_EXPR1:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[__VLA_EXPR2:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[D9:%.*]] = alloca [[STRUCT_TT:%.*]], align 4 +// CHECK2-32-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 +// CHECK2-32-NEXT: store [10 x float]* [[B]], [10 x float]** [[B_ADDR]], align 4 +// CHECK2-32-NEXT: store i32 [[VLA]], i32* [[VLA_ADDR]], align 4 +// CHECK2-32-NEXT: store float* [[BN]], float** [[BN_ADDR]], align 4 +// CHECK2-32-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[C_ADDR]], align 4 +// CHECK2-32-NEXT: store i32 [[VLA1]], i32* [[VLA_ADDR2]], align 4 +// CHECK2-32-NEXT: store i32 [[VLA3]], i32* [[VLA_ADDR4]], align 4 +// CHECK2-32-NEXT: store double* [[CN]], double** [[CN_ADDR]], align 4 +// CHECK2-32-NEXT: store %struct.TT* [[D]], %struct.TT** [[D_ADDR]], align 4 +// CHECK2-32-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* +// CHECK2-32-NEXT: [[TMP0:%.*]] = load [10 x float]*, [10 x float]** [[B_ADDR]], align 4 +// CHECK2-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4 +// CHECK2-32-NEXT: [[TMP2:%.*]] = load float*, float** [[BN_ADDR]], align 4 +// CHECK2-32-NEXT: [[TMP3:%.*]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]], align 4 +// CHECK2-32-NEXT: [[TMP4:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4 +// CHECK2-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[VLA_ADDR4]], align 4 +// CHECK2-32-NEXT: [[TMP6:%.*]] = load double*, double** [[CN_ADDR]], align 4 +// CHECK2-32-NEXT: [[TMP7:%.*]] = load %struct.TT*, %struct.TT** [[D_ADDR]], align 4 +// CHECK2-32-NEXT: [[TMP8:%.*]] = bitcast [10 x float]* [[B5]] to i8* +// CHECK2-32-NEXT: [[TMP9:%.*]] = bitcast [10 x float]* [[TMP0]] to i8* +// CHECK2-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i32 40, i1 false) +// CHECK2-32-NEXT: [[TMP10:%.*]] = call i8* @llvm.stacksave() +// CHECK2-32-NEXT: store i8* [[TMP10]], i8** [[SAVED_STACK]], align 4 +// CHECK2-32-NEXT: [[VLA6:%.*]] = alloca 
float, i32 [[TMP1]], align 4 +// CHECK2-32-NEXT: store i32 [[TMP1]], i32* [[__VLA_EXPR0]], align 4 +// CHECK2-32-NEXT: [[TMP11:%.*]] = mul nuw i32 [[TMP1]], 4 +// CHECK2-32-NEXT: [[TMP12:%.*]] = bitcast float* [[VLA6]] to i8* +// CHECK2-32-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP2]] to i8* +// CHECK2-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP12]], i8* align 4 [[TMP13]], i32 [[TMP11]], i1 false) +// CHECK2-32-NEXT: [[TMP14:%.*]] = bitcast [5 x [10 x double]]* [[C7]] to i8* +// CHECK2-32-NEXT: [[TMP15:%.*]] = bitcast [5 x [10 x double]]* [[TMP3]] to i8* +// CHECK2-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i32 400, i1 false) +// CHECK2-32-NEXT: [[TMP16:%.*]] = mul nuw i32 [[TMP4]], [[TMP5]] +// CHECK2-32-NEXT: [[VLA8:%.*]] = alloca double, i32 [[TMP16]], align 8 +// CHECK2-32-NEXT: store i32 [[TMP4]], i32* [[__VLA_EXPR1]], align 4 +// CHECK2-32-NEXT: store i32 [[TMP5]], i32* [[__VLA_EXPR2]], align 4 +// CHECK2-32-NEXT: [[TMP17:%.*]] = mul nuw i32 [[TMP4]], [[TMP5]] +// CHECK2-32-NEXT: [[TMP18:%.*]] = mul nuw i32 [[TMP17]], 8 +// CHECK2-32-NEXT: [[TMP19:%.*]] = bitcast double* [[VLA8]] to i8* +// CHECK2-32-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP6]] to i8* +// CHECK2-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP19]], i8* align 8 [[TMP20]], i32 [[TMP18]], i1 false) +// CHECK2-32-NEXT: [[TMP21:%.*]] = bitcast %struct.TT* [[D9]] to i8* +// CHECK2-32-NEXT: [[TMP22:%.*]] = bitcast %struct.TT* [[TMP7]] to i8* +// CHECK2-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP21]], i8* align 4 [[TMP22]], i32 12, i1 false) +// CHECK2-32-NEXT: [[TMP23:%.*]] = load i16, i16* [[CONV]], align 2 +// CHECK2-32-NEXT: [[CONV10:%.*]] = sext i16 [[TMP23]] to i32 +// CHECK2-32-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV10]], 1 +// CHECK2-32-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD]] to i16 +// CHECK2-32-NEXT: store i16 [[CONV11]], i16* [[CONV]], align 2 +// CHECK2-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[B5]], i32 0, i32 2 +// CHECK2-32-NEXT: store float 1.000000e+00, float* [[ARRAYIDX]], align 4 +// CHECK2-32-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[VLA6]], i32 3 +// CHECK2-32-NEXT: store float 1.000000e+00, float* [[ARRAYIDX12]], align 4 +// CHECK2-32-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[C7]], i32 0, i32 1 +// CHECK2-32-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX13]], i32 0, i32 2 +// CHECK2-32-NEXT: store double 1.000000e+00, double* [[ARRAYIDX14]], align 8 +// CHECK2-32-NEXT: [[TMP24:%.*]] = mul nsw i32 1, [[TMP5]] +// CHECK2-32-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds double, double* [[VLA8]], i32 [[TMP24]] +// CHECK2-32-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX15]], i32 3 +// CHECK2-32-NEXT: store double 1.000000e+00, double* [[ARRAYIDX16]], align 8 +// CHECK2-32-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 0 +// CHECK2-32-NEXT: store i64 1, i64* [[X]], align 4 +// CHECK2-32-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 1 +// CHECK2-32-NEXT: store i8 1, i8* [[Y]], align 4 +// CHECK2-32-NEXT: [[TMP25:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// CHECK2-32-NEXT: call void @llvm.stackrestore(i8* [[TMP25]]) +// CHECK2-32-NEXT: ret void +// CHECK2-32-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111 +// CHECK2-32-SAME: (double* noundef [[PTR:%.*]], %struct.TT.0* noundef nonnull align 4 dereferenceable(8) [[E:%.*]]) #[[ATTR2]] { +// CHECK2-32-NEXT: entry: +// CHECK2-32-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 4 +// CHECK2-32-NEXT: [[E_ADDR:%.*]] = alloca %struct.TT.0*, align 4 +// CHECK2-32-NEXT: [[E1:%.*]] = alloca [[STRUCT_TT_0:%.*]], align 4 +// CHECK2-32-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 4 +// CHECK2-32-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[E_ADDR]], align 4 +// CHECK2-32-NEXT: [[TMP0:%.*]] = load %struct.TT.0*, %struct.TT.0** [[E_ADDR]], align 4 +// CHECK2-32-NEXT: [[TMP1:%.*]] = bitcast %struct.TT.0* [[E1]] to i8* +// CHECK2-32-NEXT: [[TMP2:%.*]] = bitcast %struct.TT.0* [[TMP0]] to i8* +// CHECK2-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i32 8, i1 false) +// CHECK2-32-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E1]], i32 0, i32 0 +// CHECK2-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[X]], align 4 +// CHECK2-32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP3]] to double +// CHECK2-32-NEXT: [[TMP4:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// CHECK2-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[TMP4]], i32 0 +// CHECK2-32-NEXT: store double [[CONV]], double* [[ARRAYIDX]], align 4 +// CHECK2-32-NEXT: [[TMP5:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// CHECK2-32-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[TMP5]], i32 0 +// CHECK2-32-NEXT: [[TMP6:%.*]] = load double, double* [[ARRAYIDX2]], align 4 +// CHECK2-32-NEXT: [[INC:%.*]] = fadd double [[TMP6]], 1.000000e+00 +// CHECK2-32-NEXT: store double [[INC]], double* [[ARRAYIDX2]], align 4 +// CHECK2-32-NEXT: ret void +// CHECK2-32-LABEL: define {{[^@]+}}@_Z3bariPd +// CHECK2-32-SAME: (i32 noundef [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0]] { +// CHECK2-32-NEXT: entry: +// CHECK2-32-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 4 +// CHECK2-32-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[S:%.*]] = alloca [[STRUCT_S1:%.*]], align 4 +// CHECK2-32-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK2-32-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 4 +// CHECK2-32-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK2-32-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-32-NEXT: [[TMP1:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// CHECK2-32-NEXT: [[CALL:%.*]] = call noundef i32 @_Z3fooiPd(i32 noundef [[TMP0]], double* noundef [[TMP1]]) +// CHECK2-32-NEXT: [[TMP2:%.*]] = load i32, i32* [[A]], align 4 +// CHECK2-32-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[CALL]] +// CHECK2-32-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// CHECK2-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-32-NEXT: [[CALL1:%.*]] = call noundef i32 @_ZN2S12r1Ei(%struct.S1* noundef nonnull align 4 dereferenceable(8) [[S]], i32 noundef [[TMP3]]) +// CHECK2-32-NEXT: [[TMP4:%.*]] = load i32, i32* [[A]], align 4 +// CHECK2-32-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], [[CALL1]] +// CHECK2-32-NEXT: store i32 [[ADD2]], i32* [[A]], align 4 +// CHECK2-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-32-NEXT: [[CALL3:%.*]] = call noundef i32 @_ZL7fstatici(i32 noundef [[TMP5]]) +// CHECK2-32-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4 +// CHECK2-32-NEXT: [[ADD4:%.*]] = add 
nsw i32 [[TMP6]], [[CALL3]] +// CHECK2-32-NEXT: store i32 [[ADD4]], i32* [[A]], align 4 +// CHECK2-32-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-32-NEXT: [[CALL5:%.*]] = call noundef i32 @_Z9ftemplateIiET_i(i32 noundef [[TMP7]]) +// CHECK2-32-NEXT: [[TMP8:%.*]] = load i32, i32* [[A]], align 4 +// CHECK2-32-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP8]], [[CALL5]] +// CHECK2-32-NEXT: store i32 [[ADD6]], i32* [[A]], align 4 +// CHECK2-32-NEXT: [[TMP9:%.*]] = load i32, i32* [[A]], align 4 +// CHECK2-32-NEXT: ret i32 [[TMP9]] +// CHECK2-32-LABEL: define {{[^@]+}}@_ZN2S12r1Ei +// CHECK2-32-SAME: (%struct.S1* noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0]] comdat align 2 { +// CHECK2-32-NEXT: entry: +// CHECK2-32-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 4 +// CHECK2-32-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[B:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// CHECK2-32-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[B_CASTED:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK2-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK2-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [5 x i8*], align 4 +// CHECK2-32-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [5 x i64], align 4 +// CHECK2-32-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 4 +// CHECK2-32-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK2-32-NEXT: [[THIS1:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 4 +// CHECK2-32-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-32-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// CHECK2-32-NEXT: store i32 [[ADD]], i32* [[B]], align 4 +// CHECK2-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-32-NEXT: [[TMP2:%.*]] = call i8* @llvm.stacksave() +// CHECK2-32-NEXT: store i8* [[TMP2]], i8** [[SAVED_STACK]], align 4 +// CHECK2-32-NEXT: [[TMP3:%.*]] = mul nuw i32 2, [[TMP1]] +// CHECK2-32-NEXT: [[VLA:%.*]] = alloca i16, i32 [[TMP3]], align 2 +// CHECK2-32-NEXT: store i32 [[TMP1]], i32* [[__VLA_EXPR0]], align 4 +// CHECK2-32-NEXT: [[TMP4:%.*]] = load i32, i32* [[B]], align 4 +// CHECK2-32-NEXT: store i32 [[TMP4]], i32* [[B_CASTED]], align 4 +// CHECK2-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[B_CASTED]], align 4 +// CHECK2-32-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[THIS1]], i32 0, i32 0 +// CHECK2-32-NEXT: [[TMP6:%.*]] = mul nuw i32 2, [[TMP1]] +// CHECK2-32-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 2 +// CHECK2-32-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64 +// CHECK2-32-NEXT: [[TMP9:%.*]] = bitcast [5 x i64]* [[DOTOFFLOAD_SIZES]] to i8* +// CHECK2-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP9]], i8* align 4 bitcast ([5 x i64]* @.offload_sizes.5 to i8*), i32 40, i1 false) +// CHECK2-32-NEXT: [[TMP10:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK2-32-NEXT: [[TMP11:%.*]] = bitcast i8** [[TMP10]] to %struct.S1** +// CHECK2-32-NEXT: store %struct.S1* [[THIS1]], %struct.S1** [[TMP11]], align 4 +// CHECK2-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK2-32-NEXT: [[TMP13:%.*]] = bitcast i8** [[TMP12]] to double** +// CHECK2-32-NEXT: store double* [[A]], double** [[TMP13]], align 4 +// CHECK2-32-NEXT: [[TMP14:%.*]] = getelementptr 
inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CHECK2-32-NEXT: store i8* null, i8** [[TMP14]], align 4 +// CHECK2-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK2-32-NEXT: [[TMP16:%.*]] = bitcast i8** [[TMP15]] to i32* +// CHECK2-32-NEXT: store i32 [[TMP5]], i32* [[TMP16]], align 4 +// CHECK2-32-NEXT: [[TMP17:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK2-32-NEXT: [[TMP18:%.*]] = bitcast i8** [[TMP17]] to i32* +// CHECK2-32-NEXT: store i32 [[TMP5]], i32* [[TMP18]], align 4 +// CHECK2-32-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1 +// CHECK2-32-NEXT: store i8* null, i8** [[TMP19]], align 4 +// CHECK2-32-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK2-32-NEXT: [[TMP21:%.*]] = bitcast i8** [[TMP20]] to i32* +// CHECK2-32-NEXT: store i32 2, i32* [[TMP21]], align 4 +// CHECK2-32-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK2-32-NEXT: [[TMP23:%.*]] = bitcast i8** [[TMP22]] to i32* +// CHECK2-32-NEXT: store i32 2, i32* [[TMP23]], align 4 +// CHECK2-32-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2 +// CHECK2-32-NEXT: store i8* null, i8** [[TMP24]], align 4 +// CHECK2-32-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 3 +// CHECK2-32-NEXT: [[TMP26:%.*]] = bitcast i8** [[TMP25]] to i32* +// CHECK2-32-NEXT: store i32 [[TMP1]], i32* [[TMP26]], align 4 +// CHECK2-32-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 3 +// CHECK2-32-NEXT: [[TMP28:%.*]] = bitcast i8** [[TMP27]] to i32* +// CHECK2-32-NEXT: store i32 [[TMP1]], i32* [[TMP28]], align 4 +// CHECK2-32-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 3 +// CHECK2-32-NEXT: store i8* null, i8** [[TMP29]], align 4 +// CHECK2-32-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 4 +// CHECK2-32-NEXT: [[TMP31:%.*]] = bitcast i8** [[TMP30]] to i16** +// CHECK2-32-NEXT: store i16* [[VLA]], i16** [[TMP31]], align 4 +// CHECK2-32-NEXT: [[TMP32:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 4 +// CHECK2-32-NEXT: [[TMP33:%.*]] = bitcast i8** [[TMP32]] to i16** +// CHECK2-32-NEXT: store i16* [[VLA]], i16** [[TMP33]], align 4 +// CHECK2-32-NEXT: [[TMP34:%.*]] = getelementptr inbounds [5 x i64], [5 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 4 +// CHECK2-32-NEXT: store i64 [[TMP8]], i64* [[TMP34]], align 4 +// CHECK2-32-NEXT: [[TMP35:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 4 +// CHECK2-32-NEXT: store i8* null, i8** [[TMP35]], align 4 +// CHECK2-32-NEXT: [[TMP36:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK2-32-NEXT: [[TMP37:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK2-32-NEXT: [[TMP38:%.*]] = getelementptr inbounds [5 x i64], [5 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +// CHECK2-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK2-32-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], 
%struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK2-32-NEXT: store i32 2, i32* [[TMP39]], align 4 +// CHECK2-32-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK2-32-NEXT: store i32 5, i32* [[TMP40]], align 4 +// CHECK2-32-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK2-32-NEXT: store i8** [[TMP36]], i8*** [[TMP41]], align 4 +// CHECK2-32-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK2-32-NEXT: store i8** [[TMP37]], i8*** [[TMP42]], align 4 +// CHECK2-32-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK2-32-NEXT: store i64* [[TMP38]], i64** [[TMP43]], align 4 +// CHECK2-32-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK2-32-NEXT: store i64* getelementptr inbounds ([5 x i64], [5 x i64]* @.offload_maptypes.6, i32 0, i32 0), i64** [[TMP44]], align 4 +// CHECK2-32-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK2-32-NEXT: store i8** null, i8*** [[TMP45]], align 4 +// CHECK2-32-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK2-32-NEXT: store i8** null, i8*** [[TMP46]], align 4 +// CHECK2-32-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK2-32-NEXT: store i64 0, i64* [[TMP47]], align 8 +// CHECK2-32-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK2-32-NEXT: store i64 0, i64* [[TMP48]], align 8 +// CHECK2-32-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK2-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP49]], align 4 +// CHECK2-32-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK2-32-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP50]], align 4 +// CHECK2-32-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK2-32-NEXT: store i32 0, i32* [[TMP51]], align 4 +// CHECK2-32-NEXT: [[TMP52:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK2-32-NEXT: [[TMP53:%.*]] = icmp ne i32 [[TMP52]], 0 +// CHECK2-32-NEXT: br i1 [[TMP53]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK2-32: omp_offload.failed: +// CHECK2-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167(%struct.S1* [[THIS1]], i32 [[TMP5]], i32 2, i32 [[TMP1]], i16* [[VLA]]) #[[ATTR3]] +// CHECK2-32-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK2-32: omp_offload.cont: 
+// CHECK2-32-NEXT: [[TMP54:%.*]] = mul nsw i32 1, [[TMP1]] +// CHECK2-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA]], i32 [[TMP54]] +// CHECK2-32-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i32 1 +// CHECK2-32-NEXT: [[TMP55:%.*]] = load i16, i16* [[ARRAYIDX2]], align 2 +// CHECK2-32-NEXT: [[CONV:%.*]] = sext i16 [[TMP55]] to i32 +// CHECK2-32-NEXT: [[TMP56:%.*]] = load i32, i32* [[B]], align 4 +// CHECK2-32-NEXT: [[ADD3:%.*]] = add nsw i32 [[CONV]], [[TMP56]] +// CHECK2-32-NEXT: [[TMP57:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// CHECK2-32-NEXT: call void @llvm.stackrestore(i8* [[TMP57]]) +// CHECK2-32-NEXT: ret i32 [[ADD3]] +// CHECK2-32-LABEL: define {{[^@]+}}@_ZL7fstatici +// CHECK2-32-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0]] { +// CHECK2-32-NEXT: entry: +// CHECK2-32-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[AAA:%.*]] = alloca i8, align 1 +// CHECK2-32-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// CHECK2-32-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[AAA_CASTED:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK2-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK2-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x i8*], align 4 +// CHECK2-32-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK2-32-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK2-32-NEXT: store i8 0, i8* [[AAA]], align 1 +// CHECK2-32-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// CHECK2-32-NEXT: store i32 [[TMP0]], i32* [[A_CASTED]], align 4 +// CHECK2-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[A_CASTED]], align 4 +// CHECK2-32-NEXT: [[TMP2:%.*]] = load i8, i8* [[AAA]], align 1 +// CHECK2-32-NEXT: [[CONV:%.*]] = bitcast i32* [[AAA_CASTED]] to i8* +// CHECK2-32-NEXT: store i8 [[TMP2]], i8* [[CONV]], align 1 +// CHECK2-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[AAA_CASTED]], align 4 +// CHECK2-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK2-32-NEXT: [[TMP5:%.*]] = bitcast i8** [[TMP4]] to i32* +// CHECK2-32-NEXT: store i32 [[TMP1]], i32* [[TMP5]], align 4 +// CHECK2-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK2-32-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32* +// CHECK2-32-NEXT: store i32 [[TMP1]], i32* [[TMP7]], align 4 +// CHECK2-32-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CHECK2-32-NEXT: store i8* null, i8** [[TMP8]], align 4 +// CHECK2-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK2-32-NEXT: [[TMP10:%.*]] = bitcast i8** [[TMP9]] to i32* +// CHECK2-32-NEXT: store i32 [[TMP3]], i32* [[TMP10]], align 4 +// CHECK2-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK2-32-NEXT: [[TMP12:%.*]] = bitcast i8** [[TMP11]] to i32* +// CHECK2-32-NEXT: store i32 [[TMP3]], i32* [[TMP12]], align 4 +// CHECK2-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1 +// CHECK2-32-NEXT: store i8* null, i8** [[TMP13]], align 4 +// CHECK2-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK2-32-NEXT: [[TMP15:%.*]] = bitcast i8** [[TMP14]] to [10 
x i32]** +// CHECK2-32-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP15]], align 4 +// CHECK2-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK2-32-NEXT: [[TMP17:%.*]] = bitcast i8** [[TMP16]] to [10 x i32]** +// CHECK2-32-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP17]], align 4 +// CHECK2-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2 +// CHECK2-32-NEXT: store i8* null, i8** [[TMP18]], align 4 +// CHECK2-32-NEXT: [[TMP19:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK2-32-NEXT: [[TMP20:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK2-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK2-32-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK2-32-NEXT: store i32 2, i32* [[TMP21]], align 4 +// CHECK2-32-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK2-32-NEXT: store i32 3, i32* [[TMP22]], align 4 +// CHECK2-32-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK2-32-NEXT: store i8** [[TMP19]], i8*** [[TMP23]], align 4 +// CHECK2-32-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK2-32-NEXT: store i8** [[TMP20]], i8*** [[TMP24]], align 4 +// CHECK2-32-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK2-32-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_sizes.7, i32 0, i32 0), i64** [[TMP25]], align 4 +// CHECK2-32-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK2-32-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_maptypes.8, i32 0, i32 0), i64** [[TMP26]], align 4 +// CHECK2-32-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK2-32-NEXT: store i8** null, i8*** [[TMP27]], align 4 +// CHECK2-32-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK2-32-NEXT: store i8** null, i8*** [[TMP28]], align 4 +// CHECK2-32-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK2-32-NEXT: store i64 0, i64* [[TMP29]], align 8 +// CHECK2-32-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK2-32-NEXT: store i64 0, i64* [[TMP30]], align 8 +// CHECK2-32-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK2-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP31]], align 4 +// CHECK2-32-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* 
[[KERNEL_ARGS]], i32 0, i32 11 +// CHECK2-32-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP32]], align 4 +// CHECK2-32-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK2-32-NEXT: store i32 0, i32* [[TMP33]], align 4 +// CHECK2-32-NEXT: [[TMP34:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK2-32-NEXT: [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0 +// CHECK2-32-NEXT: br i1 [[TMP35]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK2-32: omp_offload.failed: +// CHECK2-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142(i32 [[TMP1]], i32 [[TMP3]], [10 x i32]* [[B]]) #[[ATTR3]] +// CHECK2-32-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK2-32: omp_offload.cont: +// CHECK2-32-NEXT: [[TMP36:%.*]] = load i32, i32* [[A]], align 4 +// CHECK2-32-NEXT: ret i32 [[TMP36]] +// CHECK2-32-LABEL: define {{[^@]+}}@_Z9ftemplateIiET_i +// CHECK2-32-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0]] comdat { +// CHECK2-32-NEXT: entry: +// CHECK2-32-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// CHECK2-32-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [2 x i8*], align 4 +// CHECK2-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [2 x i8*], align 4 +// CHECK2-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [2 x i8*], align 4 +// CHECK2-32-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK2-32-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK2-32-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// CHECK2-32-NEXT: store i32 [[TMP0]], i32* [[A_CASTED]], align 4 +// CHECK2-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[A_CASTED]], align 4 +// CHECK2-32-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK2-32-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i32* +// CHECK2-32-NEXT: store i32 [[TMP1]], i32* [[TMP3]], align 4 +// CHECK2-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK2-32-NEXT: [[TMP5:%.*]] = bitcast i8** [[TMP4]] to i32* +// CHECK2-32-NEXT: store i32 [[TMP1]], i32* [[TMP5]], align 4 +// CHECK2-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CHECK2-32-NEXT: store i8* null, i8** [[TMP6]], align 4 +// CHECK2-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK2-32-NEXT: [[TMP8:%.*]] = bitcast i8** [[TMP7]] to [10 x i32]** +// CHECK2-32-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP8]], align 4 +// CHECK2-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK2-32-NEXT: [[TMP10:%.*]] = bitcast i8** [[TMP9]] to [10 x i32]** +// CHECK2-32-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP10]], align 4 +// CHECK2-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1 +// CHECK2-32-NEXT: store i8* null, i8** [[TMP11]], align 4 +// CHECK2-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK2-32-NEXT: [[TMP13:%.*]] = 
getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK2-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK2-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK2-32-NEXT: store i32 2, i32* [[TMP14]], align 4 +// CHECK2-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK2-32-NEXT: store i32 2, i32* [[TMP15]], align 4 +// CHECK2-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK2-32-NEXT: store i8** [[TMP12]], i8*** [[TMP16]], align 4 +// CHECK2-32-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK2-32-NEXT: store i8** [[TMP13]], i8*** [[TMP17]], align 4 +// CHECK2-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK2-32-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.9, i32 0, i32 0), i64** [[TMP18]], align 4 +// CHECK2-32-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK2-32-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.10, i32 0, i32 0), i64** [[TMP19]], align 4 +// CHECK2-32-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK2-32-NEXT: store i8** null, i8*** [[TMP20]], align 4 +// CHECK2-32-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK2-32-NEXT: store i8** null, i8*** [[TMP21]], align 4 +// CHECK2-32-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK2-32-NEXT: store i64 0, i64* [[TMP22]], align 8 +// CHECK2-32-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK2-32-NEXT: store i64 0, i64* [[TMP23]], align 8 +// CHECK2-32-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK2-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP24]], align 4 +// CHECK2-32-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK2-32-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP25]], align 4 +// CHECK2-32-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK2-32-NEXT: store i32 0, i32* [[TMP26]], align 4 +// CHECK2-32-NEXT: [[TMP27:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK2-32-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0 +// CHECK2-32-NEXT: br i1 [[TMP28]], label 
[[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK2-32: omp_offload.failed: +// CHECK2-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128(i32 [[TMP1]], [10 x i32]* [[B]]) #[[ATTR3]] +// CHECK2-32-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK2-32: omp_offload.cont: +// CHECK2-32-NEXT: [[TMP29:%.*]] = load i32, i32* [[A]], align 4 +// CHECK2-32-NEXT: ret i32 [[TMP29]] +// CHECK2-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167 +// CHECK2-32-SAME: (%struct.S1* noundef [[THIS:%.*]], i32 noundef [[B:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR2]] { +// CHECK2-32-NEXT: entry: +// CHECK2-32-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 4 +// CHECK2-32-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[C_ADDR:%.*]] = alloca i16*, align 4 +// CHECK2-32-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// CHECK2-32-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[__VLA_EXPR1:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 4 +// CHECK2-32-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 +// CHECK2-32-NEXT: store i32 [[VLA]], i32* [[VLA_ADDR]], align 4 +// CHECK2-32-NEXT: store i32 [[VLA1]], i32* [[VLA_ADDR2]], align 4 +// CHECK2-32-NEXT: store i16* [[C]], i16** [[C_ADDR]], align 4 +// CHECK2-32-NEXT: [[TMP0:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 4 +// CHECK2-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4 +// CHECK2-32-NEXT: [[TMP2:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4 +// CHECK2-32-NEXT: [[TMP3:%.*]] = load i16*, i16** [[C_ADDR]], align 4 +// CHECK2-32-NEXT: [[TMP4:%.*]] = call i8* @llvm.stacksave() +// CHECK2-32-NEXT: store i8* [[TMP4]], i8** [[SAVED_STACK]], align 4 +// CHECK2-32-NEXT: [[TMP5:%.*]] = mul nuw i32 [[TMP1]], [[TMP2]] +// CHECK2-32-NEXT: [[VLA3:%.*]] = alloca i16, i32 [[TMP5]], align 2 +// CHECK2-32-NEXT: store i32 [[TMP1]], i32* [[__VLA_EXPR0]], align 4 +// CHECK2-32-NEXT: store i32 [[TMP2]], i32* [[__VLA_EXPR1]], align 4 +// CHECK2-32-NEXT: [[TMP6:%.*]] = mul nuw i32 [[TMP1]], [[TMP2]] +// CHECK2-32-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 2 +// CHECK2-32-NEXT: [[TMP8:%.*]] = bitcast i16* [[VLA3]] to i8* +// CHECK2-32-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP3]] to i8* +// CHECK2-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[TMP8]], i8* align 2 [[TMP9]], i32 [[TMP7]], i1 false) +// CHECK2-32-NEXT: [[TMP10:%.*]] = load i32, i32* [[B_ADDR]], align 4 +// CHECK2-32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP10]] to double +// CHECK2-32-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00 +// CHECK2-32-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[TMP0]], i32 0, i32 0 +// CHECK2-32-NEXT: store double [[ADD]], double* [[A]], align 4 +// CHECK2-32-NEXT: [[A4:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0 +// CHECK2-32-NEXT: [[TMP11:%.*]] = load double, double* [[A4]], align 4 +// CHECK2-32-NEXT: [[INC:%.*]] = fadd double [[TMP11]], 1.000000e+00 +// CHECK2-32-NEXT: store double [[INC]], double* [[A4]], align 4 +// CHECK2-32-NEXT: [[CONV5:%.*]] = fptosi double [[INC]] to i16 +// CHECK2-32-NEXT: [[TMP12:%.*]] = mul nsw i32 1, [[TMP2]] +// CHECK2-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, 
i16* [[VLA3]], i32 [[TMP12]] +// CHECK2-32-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i32 1 +// CHECK2-32-NEXT: store i16 [[CONV5]], i16* [[ARRAYIDX6]], align 2 +// CHECK2-32-NEXT: [[TMP13:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// CHECK2-32-NEXT: call void @llvm.stackrestore(i8* [[TMP13]]) +// CHECK2-32-NEXT: ret void +// CHECK2-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142 +// CHECK2-32-SAME: (i32 noundef [[A:%.*]], i32 noundef [[AAA:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR2]] { +// CHECK2-32-NEXT: entry: +// CHECK2-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[AAA_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK2-32-NEXT: [[B1:%.*]] = alloca [10 x i32], align 4 +// CHECK2-32-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK2-32-NEXT: store i32 [[AAA]], i32* [[AAA_ADDR]], align 4 +// CHECK2-32-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-32-NEXT: [[CONV:%.*]] = bitcast i32* [[AAA_ADDR]] to i8* +// CHECK2-32-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-32-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B1]] to i8* +// CHECK2-32-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK2-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i32 40, i1 false) +// CHECK2-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK2-32-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK2-32-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// CHECK2-32-NEXT: [[TMP4:%.*]] = load i8, i8* [[CONV]], align 1 +// CHECK2-32-NEXT: [[CONV2:%.*]] = sext i8 [[TMP4]] to i32 +// CHECK2-32-NEXT: [[ADD3:%.*]] = add nsw i32 [[CONV2]], 1 +// CHECK2-32-NEXT: [[CONV4:%.*]] = trunc i32 [[ADD3]] to i8 +// CHECK2-32-NEXT: store i8 [[CONV4]], i8* [[CONV]], align 1 +// CHECK2-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B1]], i32 0, i32 2 +// CHECK2-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK2-32-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK2-32-NEXT: store i32 [[ADD5]], i32* [[ARRAYIDX]], align 4 +// CHECK2-32-NEXT: ret void +// CHECK2-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128 +// CHECK2-32-SAME: (i32 noundef [[A:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR2]] { +// CHECK2-32-NEXT: entry: +// CHECK2-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-32-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK2-32-NEXT: [[B1:%.*]] = alloca [10 x i32], align 4 +// CHECK2-32-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK2-32-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-32-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-32-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B1]] to i8* +// CHECK2-32-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK2-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i32 40, i1 false) +// CHECK2-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK2-32-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK2-32-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// CHECK2-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B1]], i32 0, i32 2 +// 
CHECK2-32-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK2-32-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK2-32-NEXT: store i32 [[ADD2]], i32* [[ARRAYIDX]], align 4 +// CHECK2-32-NEXT: ret void +// CHECK2-32-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// CHECK2-32-SAME: () #[[ATTR5:[0-9]+]] { +// CHECK2-32-NEXT: entry: +// CHECK2-32-NEXT: call void @__tgt_register_requires(i64 1) +// CHECK2-32-NEXT: ret void +// CHECK3-32-LABEL: define {{[^@]+}}@_Z3fooiPd +// CHECK3-32-SAME: (i32 noundef [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK3-32-NEXT: entry: +// CHECK3-32-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 4 +// CHECK3-32-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[AA:%.*]] = alloca i16, align 2 +// CHECK3-32-NEXT: [[B:%.*]] = alloca [10 x float], align 4 +// CHECK3-32-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// CHECK3-32-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[C:%.*]] = alloca [5 x [10 x double]], align 8 +// CHECK3-32-NEXT: [[__VLA_EXPR1:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[D:%.*]] = alloca [[STRUCT_TT:%.*]], align 4 +// CHECK3-32-NEXT: [[E:%.*]] = alloca [[STRUCT_TT_0:%.*]], align 4 +// CHECK3-32-NEXT: [[P:%.*]] = alloca i32*, align 64 +// CHECK3-32-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[GA_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK3-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK3-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x i8*], align 4 +// CHECK3-32-NEXT: [[AA_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[DOTOFFLOAD_BASEPTRS2:%.*]] = alloca [9 x i8*], align 4 +// CHECK3-32-NEXT: [[DOTOFFLOAD_PTRS3:%.*]] = alloca [9 x i8*], align 4 +// CHECK3-32-NEXT: [[DOTOFFLOAD_MAPPERS4:%.*]] = alloca [9 x i8*], align 4 +// CHECK3-32-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [9 x i64], align 4 +// CHECK3-32-NEXT: [[DOTOFFLOAD_BASEPTRS8:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-32-NEXT: [[DOTOFFLOAD_PTRS9:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-32-NEXT: [[DOTOFFLOAD_MAPPERS10:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-32-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-32-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 4 +// CHECK3-32-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK3-32-NEXT: store i16 0, i16* [[AA]], align 2 +// CHECK3-32-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-32-NEXT: [[TMP1:%.*]] = call i8* @llvm.stacksave() +// CHECK3-32-NEXT: store i8* [[TMP1]], i8** [[SAVED_STACK]], align 4 +// CHECK3-32-NEXT: [[VLA:%.*]] = alloca float, i32 [[TMP0]], align 4 +// CHECK3-32-NEXT: store i32 [[TMP0]], i32* [[__VLA_EXPR0]], align 4 +// CHECK3-32-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-32-NEXT: [[TMP3:%.*]] = mul nuw i32 5, [[TMP2]] +// CHECK3-32-NEXT: [[VLA1:%.*]] = alloca double, i32 [[TMP3]], align 8 +// CHECK3-32-NEXT: store i32 [[TMP2]], i32* [[__VLA_EXPR1]], align 4 +// CHECK3-32-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 0 +// CHECK3-32-NEXT: [[TMP4:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-32-NEXT: store i32 [[TMP4]], i32* [[X]], align 4 +// CHECK3-32-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 1 +// CHECK3-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// 
CHECK3-32-NEXT: store i32 [[TMP5]], i32* [[Y]], align 4 +// CHECK3-32-NEXT: store i32* [[A]], i32** [[P]], align 64 +// CHECK3-32-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4 +// CHECK3-32-NEXT: store i32 [[TMP6]], i32* [[A_CASTED]], align 4 +// CHECK3-32-NEXT: [[TMP7:%.*]] = load i32, i32* [[A_CASTED]], align 4 +// CHECK3-32-NEXT: [[TMP8:%.*]] = load i32*, i32** [[P]], align 64 +// CHECK3-32-NEXT: [[TMP9:%.*]] = load i32, i32* @ga, align 4 +// CHECK3-32-NEXT: store i32 [[TMP9]], i32* [[GA_CASTED]], align 4 +// CHECK3-32-NEXT: [[TMP10:%.*]] = load i32, i32* [[GA_CASTED]], align 4 +// CHECK3-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK3-32-NEXT: [[TMP12:%.*]] = bitcast i8** [[TMP11]] to i32* +// CHECK3-32-NEXT: store i32 [[TMP7]], i32* [[TMP12]], align 4 +// CHECK3-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK3-32-NEXT: [[TMP14:%.*]] = bitcast i8** [[TMP13]] to i32* +// CHECK3-32-NEXT: store i32 [[TMP7]], i32* [[TMP14]], align 4 +// CHECK3-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CHECK3-32-NEXT: store i8* null, i8** [[TMP15]], align 4 +// CHECK3-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK3-32-NEXT: [[TMP17:%.*]] = bitcast i8** [[TMP16]] to i32** +// CHECK3-32-NEXT: store i32* [[TMP8]], i32** [[TMP17]], align 4 +// CHECK3-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK3-32-NEXT: [[TMP19:%.*]] = bitcast i8** [[TMP18]] to i32** +// CHECK3-32-NEXT: store i32* [[TMP8]], i32** [[TMP19]], align 4 +// CHECK3-32-NEXT: [[TMP20:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1 +// CHECK3-32-NEXT: store i8* null, i8** [[TMP20]], align 4 +// CHECK3-32-NEXT: [[TMP21:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK3-32-NEXT: [[TMP22:%.*]] = bitcast i8** [[TMP21]] to i32* +// CHECK3-32-NEXT: store i32 [[TMP10]], i32* [[TMP22]], align 4 +// CHECK3-32-NEXT: [[TMP23:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK3-32-NEXT: [[TMP24:%.*]] = bitcast i8** [[TMP23]] to i32* +// CHECK3-32-NEXT: store i32 [[TMP10]], i32* [[TMP24]], align 4 +// CHECK3-32-NEXT: [[TMP25:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2 +// CHECK3-32-NEXT: store i8* null, i8** [[TMP25]], align 4 +// CHECK3-32-NEXT: [[TMP26:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK3-32-NEXT: [[TMP27:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK3-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK3-32-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK3-32-NEXT: store i32 2, i32* [[TMP28]], align 4 +// CHECK3-32-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK3-32-NEXT: store i32 3, i32* [[TMP29]], align 4 +// CHECK3-32-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK3-32-NEXT: 
store i8** [[TMP26]], i8*** [[TMP30]], align 4 +// CHECK3-32-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK3-32-NEXT: store i8** [[TMP27]], i8*** [[TMP31]], align 4 +// CHECK3-32-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK3-32-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_sizes, i32 0, i32 0), i64** [[TMP32]], align 4 +// CHECK3-32-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK3-32-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_maptypes, i32 0, i32 0), i64** [[TMP33]], align 4 +// CHECK3-32-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK3-32-NEXT: store i8** null, i8*** [[TMP34]], align 4 +// CHECK3-32-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK3-32-NEXT: store i8** null, i8*** [[TMP35]], align 4 +// CHECK3-32-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK3-32-NEXT: store i64 0, i64* [[TMP36]], align 8 +// CHECK3-32-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK3-32-NEXT: store i64 0, i64* [[TMP37]], align 8 +// CHECK3-32-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK3-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP38]], align 4 +// CHECK3-32-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK3-32-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP39]], align 4 +// CHECK3-32-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK3-32-NEXT: store i32 0, i32* [[TMP40]], align 4 +// CHECK3-32-NEXT: [[TMP41:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK3-32-NEXT: [[TMP42:%.*]] = icmp ne i32 [[TMP41]], 0 +// CHECK3-32-NEXT: br i1 [[TMP42]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK3-32: omp_offload.failed: +// CHECK3-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63(i32 [[TMP7]], i32* [[TMP8]], i32 [[TMP10]]) #[[ATTR3:[0-9]+]] +// CHECK3-32-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK3-32: omp_offload.cont: +// CHECK3-32-NEXT: [[TMP43:%.*]] = load i16, i16* [[AA]], align 2 +// CHECK3-32-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_CASTED]] to i16* +// CHECK3-32-NEXT: store i16 [[TMP43]], i16* [[CONV]], align 2 +// CHECK3-32-NEXT: [[TMP44:%.*]] = load i32, i32* [[AA_CASTED]], align 4 +// CHECK3-32-NEXT: [[TMP45:%.*]] = mul nuw i32 [[TMP0]], 4 +// CHECK3-32-NEXT: [[TMP46:%.*]] = sext i32 [[TMP45]] to i64 +// CHECK3-32-NEXT: [[TMP47:%.*]] = mul nuw i32 5, [[TMP2]] +// 
CHECK3-32-NEXT: [[TMP48:%.*]] = mul nuw i32 [[TMP47]], 8 +// CHECK3-32-NEXT: [[TMP49:%.*]] = sext i32 [[TMP48]] to i64 +// CHECK3-32-NEXT: [[TMP50:%.*]] = bitcast [9 x i64]* [[DOTOFFLOAD_SIZES]] to i8* +// CHECK3-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP50]], i8* align 4 bitcast ([9 x i64]* @.offload_sizes.1 to i8*), i32 72, i1 false) +// CHECK3-32-NEXT: [[TMP51:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0 +// CHECK3-32-NEXT: [[TMP52:%.*]] = bitcast i8** [[TMP51]] to i32* +// CHECK3-32-NEXT: store i32 [[TMP44]], i32* [[TMP52]], align 4 +// CHECK3-32-NEXT: [[TMP53:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 0 +// CHECK3-32-NEXT: [[TMP54:%.*]] = bitcast i8** [[TMP53]] to i32* +// CHECK3-32-NEXT: store i32 [[TMP44]], i32* [[TMP54]], align 4 +// CHECK3-32-NEXT: [[TMP55:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 0 +// CHECK3-32-NEXT: store i8* null, i8** [[TMP55]], align 4 +// CHECK3-32-NEXT: [[TMP56:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 1 +// CHECK3-32-NEXT: [[TMP57:%.*]] = bitcast i8** [[TMP56]] to [10 x float]** +// CHECK3-32-NEXT: store [10 x float]* [[B]], [10 x float]** [[TMP57]], align 4 +// CHECK3-32-NEXT: [[TMP58:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 1 +// CHECK3-32-NEXT: [[TMP59:%.*]] = bitcast i8** [[TMP58]] to [10 x float]** +// CHECK3-32-NEXT: store [10 x float]* [[B]], [10 x float]** [[TMP59]], align 4 +// CHECK3-32-NEXT: [[TMP60:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 1 +// CHECK3-32-NEXT: store i8* null, i8** [[TMP60]], align 4 +// CHECK3-32-NEXT: [[TMP61:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 2 +// CHECK3-32-NEXT: [[TMP62:%.*]] = bitcast i8** [[TMP61]] to i32* +// CHECK3-32-NEXT: store i32 [[TMP0]], i32* [[TMP62]], align 4 +// CHECK3-32-NEXT: [[TMP63:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 2 +// CHECK3-32-NEXT: [[TMP64:%.*]] = bitcast i8** [[TMP63]] to i32* +// CHECK3-32-NEXT: store i32 [[TMP0]], i32* [[TMP64]], align 4 +// CHECK3-32-NEXT: [[TMP65:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 2 +// CHECK3-32-NEXT: store i8* null, i8** [[TMP65]], align 4 +// CHECK3-32-NEXT: [[TMP66:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 3 +// CHECK3-32-NEXT: [[TMP67:%.*]] = bitcast i8** [[TMP66]] to float** +// CHECK3-32-NEXT: store float* [[VLA]], float** [[TMP67]], align 4 +// CHECK3-32-NEXT: [[TMP68:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 3 +// CHECK3-32-NEXT: [[TMP69:%.*]] = bitcast i8** [[TMP68]] to float** +// CHECK3-32-NEXT: store float* [[VLA]], float** [[TMP69]], align 4 +// CHECK3-32-NEXT: [[TMP70:%.*]] = getelementptr inbounds [9 x i64], [9 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 3 +// CHECK3-32-NEXT: store i64 [[TMP46]], i64* [[TMP70]], align 4 +// CHECK3-32-NEXT: [[TMP71:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 3 +// CHECK3-32-NEXT: store i8* null, i8** [[TMP71]], align 4 +// CHECK3-32-NEXT: [[TMP72:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 4 +// CHECK3-32-NEXT: [[TMP73:%.*]] = bitcast i8** [[TMP72]] to [5 x [10 x double]]** +// CHECK3-32-NEXT: store [5 x 
[10 x double]]* [[C]], [5 x [10 x double]]** [[TMP73]], align 4 +// CHECK3-32-NEXT: [[TMP74:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 4 +// CHECK3-32-NEXT: [[TMP75:%.*]] = bitcast i8** [[TMP74]] to [5 x [10 x double]]** +// CHECK3-32-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[TMP75]], align 4 +// CHECK3-32-NEXT: [[TMP76:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 4 +// CHECK3-32-NEXT: store i8* null, i8** [[TMP76]], align 4 +// CHECK3-32-NEXT: [[TMP77:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 5 +// CHECK3-32-NEXT: [[TMP78:%.*]] = bitcast i8** [[TMP77]] to i32* +// CHECK3-32-NEXT: store i32 5, i32* [[TMP78]], align 4 +// CHECK3-32-NEXT: [[TMP79:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 5 +// CHECK3-32-NEXT: [[TMP80:%.*]] = bitcast i8** [[TMP79]] to i32* +// CHECK3-32-NEXT: store i32 5, i32* [[TMP80]], align 4 +// CHECK3-32-NEXT: [[TMP81:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 5 +// CHECK3-32-NEXT: store i8* null, i8** [[TMP81]], align 4 +// CHECK3-32-NEXT: [[TMP82:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 6 +// CHECK3-32-NEXT: [[TMP83:%.*]] = bitcast i8** [[TMP82]] to i32* +// CHECK3-32-NEXT: store i32 [[TMP2]], i32* [[TMP83]], align 4 +// CHECK3-32-NEXT: [[TMP84:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 6 +// CHECK3-32-NEXT: [[TMP85:%.*]] = bitcast i8** [[TMP84]] to i32* +// CHECK3-32-NEXT: store i32 [[TMP2]], i32* [[TMP85]], align 4 +// CHECK3-32-NEXT: [[TMP86:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 6 +// CHECK3-32-NEXT: store i8* null, i8** [[TMP86]], align 4 +// CHECK3-32-NEXT: [[TMP87:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 7 +// CHECK3-32-NEXT: [[TMP88:%.*]] = bitcast i8** [[TMP87]] to double** +// CHECK3-32-NEXT: store double* [[VLA1]], double** [[TMP88]], align 4 +// CHECK3-32-NEXT: [[TMP89:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 7 +// CHECK3-32-NEXT: [[TMP90:%.*]] = bitcast i8** [[TMP89]] to double** +// CHECK3-32-NEXT: store double* [[VLA1]], double** [[TMP90]], align 4 +// CHECK3-32-NEXT: [[TMP91:%.*]] = getelementptr inbounds [9 x i64], [9 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 7 +// CHECK3-32-NEXT: store i64 [[TMP49]], i64* [[TMP91]], align 4 +// CHECK3-32-NEXT: [[TMP92:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 7 +// CHECK3-32-NEXT: store i8* null, i8** [[TMP92]], align 4 +// CHECK3-32-NEXT: [[TMP93:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 8 +// CHECK3-32-NEXT: [[TMP94:%.*]] = bitcast i8** [[TMP93]] to %struct.TT** +// CHECK3-32-NEXT: store %struct.TT* [[D]], %struct.TT** [[TMP94]], align 4 +// CHECK3-32-NEXT: [[TMP95:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 8 +// CHECK3-32-NEXT: [[TMP96:%.*]] = bitcast i8** [[TMP95]] to %struct.TT** +// CHECK3-32-NEXT: store %struct.TT* [[D]], %struct.TT** [[TMP96]], align 4 +// CHECK3-32-NEXT: [[TMP97:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 8 +// CHECK3-32-NEXT: store i8* null, i8** [[TMP97]], align 4 +// CHECK3-32-NEXT: [[TMP98:%.*]] = getelementptr inbounds [9 x i8*], [9 x 
i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0 +// CHECK3-32-NEXT: [[TMP99:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 0 +// CHECK3-32-NEXT: [[TMP100:%.*]] = getelementptr inbounds [9 x i64], [9 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +// CHECK3-32-NEXT: [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK3-32-NEXT: [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 0 +// CHECK3-32-NEXT: store i32 2, i32* [[TMP101]], align 4 +// CHECK3-32-NEXT: [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 1 +// CHECK3-32-NEXT: store i32 9, i32* [[TMP102]], align 4 +// CHECK3-32-NEXT: [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 2 +// CHECK3-32-NEXT: store i8** [[TMP98]], i8*** [[TMP103]], align 4 +// CHECK3-32-NEXT: [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 3 +// CHECK3-32-NEXT: store i8** [[TMP99]], i8*** [[TMP104]], align 4 +// CHECK3-32-NEXT: [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 4 +// CHECK3-32-NEXT: store i64* [[TMP100]], i64** [[TMP105]], align 4 +// CHECK3-32-NEXT: [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 5 +// CHECK3-32-NEXT: store i64* getelementptr inbounds ([9 x i64], [9 x i64]* @.offload_maptypes.2, i32 0, i32 0), i64** [[TMP106]], align 4 +// CHECK3-32-NEXT: [[TMP107:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 6 +// CHECK3-32-NEXT: store i8** null, i8*** [[TMP107]], align 4 +// CHECK3-32-NEXT: [[TMP108:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 7 +// CHECK3-32-NEXT: store i8** null, i8*** [[TMP108]], align 4 +// CHECK3-32-NEXT: [[TMP109:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 8 +// CHECK3-32-NEXT: store i64 0, i64* [[TMP109]], align 8 +// CHECK3-32-NEXT: [[TMP110:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 9 +// CHECK3-32-NEXT: store i64 0, i64* [[TMP110]], align 8 +// CHECK3-32-NEXT: [[TMP111:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 10 +// CHECK3-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP111]], align 4 +// CHECK3-32-NEXT: [[TMP112:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 11 +// CHECK3-32-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP112]], align 4 +// CHECK3-32-NEXT: [[TMP113:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 12 +// CHECK3-32-NEXT: store i32 0, i32* [[TMP113]], align 4 +// CHECK3-32-NEXT: [[TMP114:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70.region_id, 
%struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]]) +// CHECK3-32-NEXT: [[TMP115:%.*]] = icmp ne i32 [[TMP114]], 0 +// CHECK3-32-NEXT: br i1 [[TMP115]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] +// CHECK3-32: omp_offload.failed6: +// CHECK3-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70(i32 [[TMP44]], [10 x float]* [[B]], i32 [[TMP0]], float* [[VLA]], [5 x [10 x double]]* [[C]], i32 5, i32 [[TMP2]], double* [[VLA1]], %struct.TT* [[D]]) #[[ATTR3]] +// CHECK3-32-NEXT: br label [[OMP_OFFLOAD_CONT7]] +// CHECK3-32: omp_offload.cont7: +// CHECK3-32-NEXT: [[TMP116:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// CHECK3-32-NEXT: [[TMP117:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0 +// CHECK3-32-NEXT: [[TMP118:%.*]] = bitcast i8** [[TMP117]] to double** +// CHECK3-32-NEXT: store double* [[TMP116]], double** [[TMP118]], align 4 +// CHECK3-32-NEXT: [[TMP119:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS9]], i32 0, i32 0 +// CHECK3-32-NEXT: [[TMP120:%.*]] = bitcast i8** [[TMP119]] to double** +// CHECK3-32-NEXT: store double* [[TMP116]], double** [[TMP120]], align 4 +// CHECK3-32-NEXT: [[TMP121:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS10]], i32 0, i32 0 +// CHECK3-32-NEXT: store i8* null, i8** [[TMP121]], align 4 +// CHECK3-32-NEXT: [[TMP122:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 1 +// CHECK3-32-NEXT: [[TMP123:%.*]] = bitcast i8** [[TMP122]] to %struct.TT.0** +// CHECK3-32-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[TMP123]], align 4 +// CHECK3-32-NEXT: [[TMP124:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS9]], i32 0, i32 1 +// CHECK3-32-NEXT: [[TMP125:%.*]] = bitcast i8** [[TMP124]] to %struct.TT.0** +// CHECK3-32-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[TMP125]], align 4 +// CHECK3-32-NEXT: [[TMP126:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS10]], i32 0, i32 1 +// CHECK3-32-NEXT: store i8* null, i8** [[TMP126]], align 4 +// CHECK3-32-NEXT: [[TMP127:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0 +// CHECK3-32-NEXT: [[TMP128:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS9]], i32 0, i32 0 +// CHECK3-32-NEXT: [[KERNEL_ARGS11:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK3-32-NEXT: [[TMP129:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 0 +// CHECK3-32-NEXT: store i32 2, i32* [[TMP129]], align 4 +// CHECK3-32-NEXT: [[TMP130:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 1 +// CHECK3-32-NEXT: store i32 2, i32* [[TMP130]], align 4 +// CHECK3-32-NEXT: [[TMP131:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 2 +// CHECK3-32-NEXT: store i8** [[TMP127]], i8*** [[TMP131]], align 4 +// CHECK3-32-NEXT: [[TMP132:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 3 +// CHECK3-32-NEXT: store i8** [[TMP128]], i8*** [[TMP132]], align 4 +// CHECK3-32-NEXT: [[TMP133:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 4 +// CHECK3-32-NEXT: store i64* getelementptr 
inbounds ([2 x i64], [2 x i64]* @.offload_sizes.3, i32 0, i32 0), i64** [[TMP133]], align 4 +// CHECK3-32-NEXT: [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 5 +// CHECK3-32-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i32 0, i32 0), i64** [[TMP134]], align 4 +// CHECK3-32-NEXT: [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 6 +// CHECK3-32-NEXT: store i8** null, i8*** [[TMP135]], align 4 +// CHECK3-32-NEXT: [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 7 +// CHECK3-32-NEXT: store i8** null, i8*** [[TMP136]], align 4 +// CHECK3-32-NEXT: [[TMP137:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 8 +// CHECK3-32-NEXT: store i64 0, i64* [[TMP137]], align 8 +// CHECK3-32-NEXT: [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 9 +// CHECK3-32-NEXT: store i64 0, i64* [[TMP138]], align 8 +// CHECK3-32-NEXT: [[TMP139:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 10 +// CHECK3-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP139]], align 4 +// CHECK3-32-NEXT: [[TMP140:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 11 +// CHECK3-32-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP140]], align 4 +// CHECK3-32-NEXT: [[TMP141:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 12 +// CHECK3-32-NEXT: store i32 0, i32* [[TMP141]], align 4 +// CHECK3-32-NEXT: [[TMP142:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]]) +// CHECK3-32-NEXT: [[TMP143:%.*]] = icmp ne i32 [[TMP142]], 0 +// CHECK3-32-NEXT: br i1 [[TMP143]], label [[OMP_OFFLOAD_FAILED12:%.*]], label [[OMP_OFFLOAD_CONT13:%.*]] +// CHECK3-32: omp_offload.failed12: +// CHECK3-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111(double* [[TMP116]], %struct.TT.0* [[E]]) #[[ATTR3]] +// CHECK3-32-NEXT: br label [[OMP_OFFLOAD_CONT13]] +// CHECK3-32: omp_offload.cont13: +// CHECK3-32-NEXT: [[TMP144:%.*]] = load i32, i32* [[A]], align 4 +// CHECK3-32-NEXT: [[TMP145:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// CHECK3-32-NEXT: call void @llvm.stackrestore(i8* [[TMP145]]) +// CHECK3-32-NEXT: ret i32 [[TMP144]] +// CHECK3-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63 +// CHECK3-32-SAME: (i32 noundef [[A:%.*]], i32* noundef [[P:%.*]], i32 noundef [[GA:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK3-32-NEXT: entry: +// CHECK3-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[P_ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-32-NEXT: [[GA_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK3-32-NEXT: store i32* [[P]], i32** [[P_ADDR]], align 4 +// CHECK3-32-NEXT: store i32 [[GA]], i32* [[GA_ADDR]], align 4 +// CHECK3-32-NEXT: ret void +// CHECK3-32-LABEL: 
define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70 +// CHECK3-32-SAME: (i32 noundef [[AA:%.*]], [10 x float]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i32 noundef [[VLA:%.*]], float* noundef nonnull align 4 dereferenceable(4) [[BN:%.*]], [5 x [10 x double]]* noundef nonnull align 4 dereferenceable(400) [[C:%.*]], i32 noundef [[VLA1:%.*]], i32 noundef [[VLA3:%.*]], double* noundef nonnull align 4 dereferenceable(8) [[CN:%.*]], %struct.TT* noundef nonnull align 4 dereferenceable(12) [[D:%.*]]) #[[ATTR2]] { +// CHECK3-32-NEXT: entry: +// CHECK3-32-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[B_ADDR:%.*]] = alloca [10 x float]*, align 4 +// CHECK3-32-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[BN_ADDR:%.*]] = alloca float*, align 4 +// CHECK3-32-NEXT: [[C_ADDR:%.*]] = alloca [5 x [10 x double]]*, align 4 +// CHECK3-32-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[VLA_ADDR4:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[CN_ADDR:%.*]] = alloca double*, align 4 +// CHECK3-32-NEXT: [[D_ADDR:%.*]] = alloca %struct.TT*, align 4 +// CHECK3-32-NEXT: [[B5:%.*]] = alloca [10 x float], align 4 +// CHECK3-32-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// CHECK3-32-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[C7:%.*]] = alloca [5 x [10 x double]], align 8 +// CHECK3-32-NEXT: [[__VLA_EXPR1:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[__VLA_EXPR2:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[D9:%.*]] = alloca [[STRUCT_TT:%.*]], align 4 +// CHECK3-32-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 +// CHECK3-32-NEXT: store [10 x float]* [[B]], [10 x float]** [[B_ADDR]], align 4 +// CHECK3-32-NEXT: store i32 [[VLA]], i32* [[VLA_ADDR]], align 4 +// CHECK3-32-NEXT: store float* [[BN]], float** [[BN_ADDR]], align 4 +// CHECK3-32-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[C_ADDR]], align 4 +// CHECK3-32-NEXT: store i32 [[VLA1]], i32* [[VLA_ADDR2]], align 4 +// CHECK3-32-NEXT: store i32 [[VLA3]], i32* [[VLA_ADDR4]], align 4 +// CHECK3-32-NEXT: store double* [[CN]], double** [[CN_ADDR]], align 4 +// CHECK3-32-NEXT: store %struct.TT* [[D]], %struct.TT** [[D_ADDR]], align 4 +// CHECK3-32-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* +// CHECK3-32-NEXT: [[TMP0:%.*]] = load [10 x float]*, [10 x float]** [[B_ADDR]], align 4 +// CHECK3-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4 +// CHECK3-32-NEXT: [[TMP2:%.*]] = load float*, float** [[BN_ADDR]], align 4 +// CHECK3-32-NEXT: [[TMP3:%.*]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]], align 4 +// CHECK3-32-NEXT: [[TMP4:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4 +// CHECK3-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[VLA_ADDR4]], align 4 +// CHECK3-32-NEXT: [[TMP6:%.*]] = load double*, double** [[CN_ADDR]], align 4 +// CHECK3-32-NEXT: [[TMP7:%.*]] = load %struct.TT*, %struct.TT** [[D_ADDR]], align 4 +// CHECK3-32-NEXT: [[TMP8:%.*]] = bitcast [10 x float]* [[B5]] to i8* +// CHECK3-32-NEXT: [[TMP9:%.*]] = bitcast [10 x float]* [[TMP0]] to i8* +// CHECK3-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i32 40, i1 false) +// CHECK3-32-NEXT: [[TMP10:%.*]] = call i8* @llvm.stacksave() +// CHECK3-32-NEXT: store i8* [[TMP10]], i8** [[SAVED_STACK]], align 4 +// CHECK3-32-NEXT: [[VLA6:%.*]] = alloca float, i32 [[TMP1]], align 4 +// CHECK3-32-NEXT: store i32 [[TMP1]], i32* [[__VLA_EXPR0]], align 4 +// CHECK3-32-NEXT: [[TMP11:%.*]] = mul nuw 
i32 [[TMP1]], 4 +// CHECK3-32-NEXT: [[TMP12:%.*]] = bitcast float* [[VLA6]] to i8* +// CHECK3-32-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP2]] to i8* +// CHECK3-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP12]], i8* align 4 [[TMP13]], i32 [[TMP11]], i1 false) +// CHECK3-32-NEXT: [[TMP14:%.*]] = bitcast [5 x [10 x double]]* [[C7]] to i8* +// CHECK3-32-NEXT: [[TMP15:%.*]] = bitcast [5 x [10 x double]]* [[TMP3]] to i8* +// CHECK3-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i32 400, i1 false) +// CHECK3-32-NEXT: [[TMP16:%.*]] = mul nuw i32 [[TMP4]], [[TMP5]] +// CHECK3-32-NEXT: [[VLA8:%.*]] = alloca double, i32 [[TMP16]], align 8 +// CHECK3-32-NEXT: store i32 [[TMP4]], i32* [[__VLA_EXPR1]], align 4 +// CHECK3-32-NEXT: store i32 [[TMP5]], i32* [[__VLA_EXPR2]], align 4 +// CHECK3-32-NEXT: [[TMP17:%.*]] = mul nuw i32 [[TMP4]], [[TMP5]] +// CHECK3-32-NEXT: [[TMP18:%.*]] = mul nuw i32 [[TMP17]], 8 +// CHECK3-32-NEXT: [[TMP19:%.*]] = bitcast double* [[VLA8]] to i8* +// CHECK3-32-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP6]] to i8* +// CHECK3-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP19]], i8* align 8 [[TMP20]], i32 [[TMP18]], i1 false) +// CHECK3-32-NEXT: [[TMP21:%.*]] = bitcast %struct.TT* [[D9]] to i8* +// CHECK3-32-NEXT: [[TMP22:%.*]] = bitcast %struct.TT* [[TMP7]] to i8* +// CHECK3-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP21]], i8* align 4 [[TMP22]], i32 12, i1 false) +// CHECK3-32-NEXT: [[TMP23:%.*]] = load i16, i16* [[CONV]], align 2 +// CHECK3-32-NEXT: [[CONV10:%.*]] = sext i16 [[TMP23]] to i32 +// CHECK3-32-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV10]], 1 +// CHECK3-32-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD]] to i16 +// CHECK3-32-NEXT: store i16 [[CONV11]], i16* [[CONV]], align 2 +// CHECK3-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[B5]], i32 0, i32 2 +// CHECK3-32-NEXT: store float 1.000000e+00, float* [[ARRAYIDX]], align 4 +// CHECK3-32-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[VLA6]], i32 3 +// CHECK3-32-NEXT: store float 1.000000e+00, float* [[ARRAYIDX12]], align 4 +// CHECK3-32-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[C7]], i32 0, i32 1 +// CHECK3-32-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX13]], i32 0, i32 2 +// CHECK3-32-NEXT: store double 1.000000e+00, double* [[ARRAYIDX14]], align 8 +// CHECK3-32-NEXT: [[TMP24:%.*]] = mul nsw i32 1, [[TMP5]] +// CHECK3-32-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds double, double* [[VLA8]], i32 [[TMP24]] +// CHECK3-32-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX15]], i32 3 +// CHECK3-32-NEXT: store double 1.000000e+00, double* [[ARRAYIDX16]], align 8 +// CHECK3-32-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 0 +// CHECK3-32-NEXT: store i64 1, i64* [[X]], align 4 +// CHECK3-32-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 1 +// CHECK3-32-NEXT: store i8 1, i8* [[Y]], align 4 +// CHECK3-32-NEXT: [[TMP25:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// CHECK3-32-NEXT: call void @llvm.stackrestore(i8* [[TMP25]]) +// CHECK3-32-NEXT: ret void +// CHECK3-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111 +// CHECK3-32-SAME: (double* noundef [[PTR:%.*]], %struct.TT.0* noundef nonnull align 4 dereferenceable(8) [[E:%.*]]) #[[ATTR2]] { +// 
CHECK3-32-NEXT: entry: +// CHECK3-32-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 4 +// CHECK3-32-NEXT: [[E_ADDR:%.*]] = alloca %struct.TT.0*, align 4 +// CHECK3-32-NEXT: [[E1:%.*]] = alloca [[STRUCT_TT_0:%.*]], align 4 +// CHECK3-32-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 4 +// CHECK3-32-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[E_ADDR]], align 4 +// CHECK3-32-NEXT: [[TMP0:%.*]] = load %struct.TT.0*, %struct.TT.0** [[E_ADDR]], align 4 +// CHECK3-32-NEXT: [[TMP1:%.*]] = bitcast %struct.TT.0* [[E1]] to i8* +// CHECK3-32-NEXT: [[TMP2:%.*]] = bitcast %struct.TT.0* [[TMP0]] to i8* +// CHECK3-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i32 8, i1 false) +// CHECK3-32-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E1]], i32 0, i32 0 +// CHECK3-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[X]], align 4 +// CHECK3-32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP3]] to double +// CHECK3-32-NEXT: [[TMP4:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// CHECK3-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[TMP4]], i32 0 +// CHECK3-32-NEXT: store double [[CONV]], double* [[ARRAYIDX]], align 4 +// CHECK3-32-NEXT: [[TMP5:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// CHECK3-32-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[TMP5]], i32 0 +// CHECK3-32-NEXT: [[TMP6:%.*]] = load double, double* [[ARRAYIDX2]], align 4 +// CHECK3-32-NEXT: [[INC:%.*]] = fadd double [[TMP6]], 1.000000e+00 +// CHECK3-32-NEXT: store double [[INC]], double* [[ARRAYIDX2]], align 4 +// CHECK3-32-NEXT: ret void +// CHECK3-32-LABEL: define {{[^@]+}}@_Z3bariPd +// CHECK3-32-SAME: (i32 noundef [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0]] { +// CHECK3-32-NEXT: entry: +// CHECK3-32-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 4 +// CHECK3-32-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[S:%.*]] = alloca [[STRUCT_S1:%.*]], align 4 +// CHECK3-32-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-32-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 4 +// CHECK3-32-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK3-32-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-32-NEXT: [[TMP1:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// CHECK3-32-NEXT: [[CALL:%.*]] = call noundef i32 @_Z3fooiPd(i32 noundef [[TMP0]], double* noundef [[TMP1]]) +// CHECK3-32-NEXT: [[TMP2:%.*]] = load i32, i32* [[A]], align 4 +// CHECK3-32-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[CALL]] +// CHECK3-32-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// CHECK3-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-32-NEXT: [[CALL1:%.*]] = call noundef i32 @_ZN2S12r1Ei(%struct.S1* noundef nonnull align 4 dereferenceable(8) [[S]], i32 noundef [[TMP3]]) +// CHECK3-32-NEXT: [[TMP4:%.*]] = load i32, i32* [[A]], align 4 +// CHECK3-32-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], [[CALL1]] +// CHECK3-32-NEXT: store i32 [[ADD2]], i32* [[A]], align 4 +// CHECK3-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-32-NEXT: [[CALL3:%.*]] = call noundef i32 @_ZL7fstatici(i32 noundef [[TMP5]]) +// CHECK3-32-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4 +// CHECK3-32-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP6]], [[CALL3]] +// CHECK3-32-NEXT: store i32 [[ADD4]], i32* [[A]], align 4 +// CHECK3-32-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-32-NEXT: [[CALL5:%.*]] = call 
noundef i32 @_Z9ftemplateIiET_i(i32 noundef [[TMP7]]) +// CHECK3-32-NEXT: [[TMP8:%.*]] = load i32, i32* [[A]], align 4 +// CHECK3-32-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP8]], [[CALL5]] +// CHECK3-32-NEXT: store i32 [[ADD6]], i32* [[A]], align 4 +// CHECK3-32-NEXT: [[TMP9:%.*]] = load i32, i32* [[A]], align 4 +// CHECK3-32-NEXT: ret i32 [[TMP9]] +// CHECK3-32-LABEL: define {{[^@]+}}@_ZN2S12r1Ei +// CHECK3-32-SAME: (%struct.S1* noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0]] comdat align 2 { +// CHECK3-32-NEXT: entry: +// CHECK3-32-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 4 +// CHECK3-32-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[B:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// CHECK3-32-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[B_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK3-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK3-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [5 x i8*], align 4 +// CHECK3-32-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [5 x i64], align 4 +// CHECK3-32-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 4 +// CHECK3-32-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-32-NEXT: [[THIS1:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 4 +// CHECK3-32-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-32-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// CHECK3-32-NEXT: store i32 [[ADD]], i32* [[B]], align 4 +// CHECK3-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-32-NEXT: [[TMP2:%.*]] = call i8* @llvm.stacksave() +// CHECK3-32-NEXT: store i8* [[TMP2]], i8** [[SAVED_STACK]], align 4 +// CHECK3-32-NEXT: [[TMP3:%.*]] = mul nuw i32 2, [[TMP1]] +// CHECK3-32-NEXT: [[VLA:%.*]] = alloca i16, i32 [[TMP3]], align 2 +// CHECK3-32-NEXT: store i32 [[TMP1]], i32* [[__VLA_EXPR0]], align 4 +// CHECK3-32-NEXT: [[TMP4:%.*]] = load i32, i32* [[B]], align 4 +// CHECK3-32-NEXT: store i32 [[TMP4]], i32* [[B_CASTED]], align 4 +// CHECK3-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[B_CASTED]], align 4 +// CHECK3-32-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[THIS1]], i32 0, i32 0 +// CHECK3-32-NEXT: [[TMP6:%.*]] = mul nuw i32 2, [[TMP1]] +// CHECK3-32-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 2 +// CHECK3-32-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64 +// CHECK3-32-NEXT: [[TMP9:%.*]] = bitcast [5 x i64]* [[DOTOFFLOAD_SIZES]] to i8* +// CHECK3-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP9]], i8* align 4 bitcast ([5 x i64]* @.offload_sizes.5 to i8*), i32 40, i1 false) +// CHECK3-32-NEXT: [[TMP10:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK3-32-NEXT: [[TMP11:%.*]] = bitcast i8** [[TMP10]] to %struct.S1** +// CHECK3-32-NEXT: store %struct.S1* [[THIS1]], %struct.S1** [[TMP11]], align 4 +// CHECK3-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK3-32-NEXT: [[TMP13:%.*]] = bitcast i8** [[TMP12]] to double** +// CHECK3-32-NEXT: store double* [[A]], double** [[TMP13]], align 4 +// CHECK3-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CHECK3-32-NEXT: store i8* null, i8** [[TMP14]], align 4 +// CHECK3-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds [5 x i8*], [5 
x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK3-32-NEXT: [[TMP16:%.*]] = bitcast i8** [[TMP15]] to i32* +// CHECK3-32-NEXT: store i32 [[TMP5]], i32* [[TMP16]], align 4 +// CHECK3-32-NEXT: [[TMP17:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK3-32-NEXT: [[TMP18:%.*]] = bitcast i8** [[TMP17]] to i32* +// CHECK3-32-NEXT: store i32 [[TMP5]], i32* [[TMP18]], align 4 +// CHECK3-32-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1 +// CHECK3-32-NEXT: store i8* null, i8** [[TMP19]], align 4 +// CHECK3-32-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK3-32-NEXT: [[TMP21:%.*]] = bitcast i8** [[TMP20]] to i32* +// CHECK3-32-NEXT: store i32 2, i32* [[TMP21]], align 4 +// CHECK3-32-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK3-32-NEXT: [[TMP23:%.*]] = bitcast i8** [[TMP22]] to i32* +// CHECK3-32-NEXT: store i32 2, i32* [[TMP23]], align 4 +// CHECK3-32-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2 +// CHECK3-32-NEXT: store i8* null, i8** [[TMP24]], align 4 +// CHECK3-32-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 3 +// CHECK3-32-NEXT: [[TMP26:%.*]] = bitcast i8** [[TMP25]] to i32* +// CHECK3-32-NEXT: store i32 [[TMP1]], i32* [[TMP26]], align 4 +// CHECK3-32-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 3 +// CHECK3-32-NEXT: [[TMP28:%.*]] = bitcast i8** [[TMP27]] to i32* +// CHECK3-32-NEXT: store i32 [[TMP1]], i32* [[TMP28]], align 4 +// CHECK3-32-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 3 +// CHECK3-32-NEXT: store i8* null, i8** [[TMP29]], align 4 +// CHECK3-32-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 4 +// CHECK3-32-NEXT: [[TMP31:%.*]] = bitcast i8** [[TMP30]] to i16** +// CHECK3-32-NEXT: store i16* [[VLA]], i16** [[TMP31]], align 4 +// CHECK3-32-NEXT: [[TMP32:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 4 +// CHECK3-32-NEXT: [[TMP33:%.*]] = bitcast i8** [[TMP32]] to i16** +// CHECK3-32-NEXT: store i16* [[VLA]], i16** [[TMP33]], align 4 +// CHECK3-32-NEXT: [[TMP34:%.*]] = getelementptr inbounds [5 x i64], [5 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 4 +// CHECK3-32-NEXT: store i64 [[TMP8]], i64* [[TMP34]], align 4 +// CHECK3-32-NEXT: [[TMP35:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 4 +// CHECK3-32-NEXT: store i8* null, i8** [[TMP35]], align 4 +// CHECK3-32-NEXT: [[TMP36:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK3-32-NEXT: [[TMP37:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK3-32-NEXT: [[TMP38:%.*]] = getelementptr inbounds [5 x i64], [5 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +// CHECK3-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK3-32-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK3-32-NEXT: store i32 2, i32* [[TMP39]], align 4 +// CHECK3-32-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], 
%struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK3-32-NEXT: store i32 5, i32* [[TMP40]], align 4 +// CHECK3-32-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK3-32-NEXT: store i8** [[TMP36]], i8*** [[TMP41]], align 4 +// CHECK3-32-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK3-32-NEXT: store i8** [[TMP37]], i8*** [[TMP42]], align 4 +// CHECK3-32-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK3-32-NEXT: store i64* [[TMP38]], i64** [[TMP43]], align 4 +// CHECK3-32-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK3-32-NEXT: store i64* getelementptr inbounds ([5 x i64], [5 x i64]* @.offload_maptypes.6, i32 0, i32 0), i64** [[TMP44]], align 4 +// CHECK3-32-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK3-32-NEXT: store i8** null, i8*** [[TMP45]], align 4 +// CHECK3-32-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK3-32-NEXT: store i8** null, i8*** [[TMP46]], align 4 +// CHECK3-32-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK3-32-NEXT: store i64 0, i64* [[TMP47]], align 8 +// CHECK3-32-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK3-32-NEXT: store i64 0, i64* [[TMP48]], align 8 +// CHECK3-32-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK3-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP49]], align 4 +// CHECK3-32-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK3-32-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP50]], align 4 +// CHECK3-32-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK3-32-NEXT: store i32 0, i32* [[TMP51]], align 4 +// CHECK3-32-NEXT: [[TMP52:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK3-32-NEXT: [[TMP53:%.*]] = icmp ne i32 [[TMP52]], 0 +// CHECK3-32-NEXT: br i1 [[TMP53]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK3-32: omp_offload.failed: +// CHECK3-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167(%struct.S1* [[THIS1]], i32 [[TMP5]], i32 2, i32 [[TMP1]], i16* [[VLA]]) #[[ATTR3]] +// CHECK3-32-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK3-32: omp_offload.cont: +// CHECK3-32-NEXT: [[TMP54:%.*]] = mul nsw i32 1, [[TMP1]] +// CHECK3-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA]], i32 [[TMP54]] +// CHECK3-32-NEXT: [[ARRAYIDX2:%.*]] = getelementptr 
inbounds i16, i16* [[ARRAYIDX]], i32 1 +// CHECK3-32-NEXT: [[TMP55:%.*]] = load i16, i16* [[ARRAYIDX2]], align 2 +// CHECK3-32-NEXT: [[CONV:%.*]] = sext i16 [[TMP55]] to i32 +// CHECK3-32-NEXT: [[TMP56:%.*]] = load i32, i32* [[B]], align 4 +// CHECK3-32-NEXT: [[ADD3:%.*]] = add nsw i32 [[CONV]], [[TMP56]] +// CHECK3-32-NEXT: [[TMP57:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// CHECK3-32-NEXT: call void @llvm.stackrestore(i8* [[TMP57]]) +// CHECK3-32-NEXT: ret i32 [[ADD3]] +// CHECK3-32-LABEL: define {{[^@]+}}@_ZL7fstatici +// CHECK3-32-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0]] { +// CHECK3-32-NEXT: entry: +// CHECK3-32-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[AAA:%.*]] = alloca i8, align 1 +// CHECK3-32-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// CHECK3-32-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[AAA_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK3-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK3-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x i8*], align 4 +// CHECK3-32-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-32-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK3-32-NEXT: store i8 0, i8* [[AAA]], align 1 +// CHECK3-32-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// CHECK3-32-NEXT: store i32 [[TMP0]], i32* [[A_CASTED]], align 4 +// CHECK3-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[A_CASTED]], align 4 +// CHECK3-32-NEXT: [[TMP2:%.*]] = load i8, i8* [[AAA]], align 1 +// CHECK3-32-NEXT: [[CONV:%.*]] = bitcast i32* [[AAA_CASTED]] to i8* +// CHECK3-32-NEXT: store i8 [[TMP2]], i8* [[CONV]], align 1 +// CHECK3-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[AAA_CASTED]], align 4 +// CHECK3-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK3-32-NEXT: [[TMP5:%.*]] = bitcast i8** [[TMP4]] to i32* +// CHECK3-32-NEXT: store i32 [[TMP1]], i32* [[TMP5]], align 4 +// CHECK3-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK3-32-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32* +// CHECK3-32-NEXT: store i32 [[TMP1]], i32* [[TMP7]], align 4 +// CHECK3-32-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CHECK3-32-NEXT: store i8* null, i8** [[TMP8]], align 4 +// CHECK3-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK3-32-NEXT: [[TMP10:%.*]] = bitcast i8** [[TMP9]] to i32* +// CHECK3-32-NEXT: store i32 [[TMP3]], i32* [[TMP10]], align 4 +// CHECK3-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK3-32-NEXT: [[TMP12:%.*]] = bitcast i8** [[TMP11]] to i32* +// CHECK3-32-NEXT: store i32 [[TMP3]], i32* [[TMP12]], align 4 +// CHECK3-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1 +// CHECK3-32-NEXT: store i8* null, i8** [[TMP13]], align 4 +// CHECK3-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK3-32-NEXT: [[TMP15:%.*]] = bitcast i8** [[TMP14]] to [10 x i32]** +// CHECK3-32-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP15]], align 4 +// CHECK3-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// 
CHECK3-32-NEXT: [[TMP17:%.*]] = bitcast i8** [[TMP16]] to [10 x i32]** +// CHECK3-32-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP17]], align 4 +// CHECK3-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2 +// CHECK3-32-NEXT: store i8* null, i8** [[TMP18]], align 4 +// CHECK3-32-NEXT: [[TMP19:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK3-32-NEXT: [[TMP20:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK3-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK3-32-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK3-32-NEXT: store i32 2, i32* [[TMP21]], align 4 +// CHECK3-32-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK3-32-NEXT: store i32 3, i32* [[TMP22]], align 4 +// CHECK3-32-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK3-32-NEXT: store i8** [[TMP19]], i8*** [[TMP23]], align 4 +// CHECK3-32-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK3-32-NEXT: store i8** [[TMP20]], i8*** [[TMP24]], align 4 +// CHECK3-32-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK3-32-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_sizes.7, i32 0, i32 0), i64** [[TMP25]], align 4 +// CHECK3-32-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK3-32-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_maptypes.8, i32 0, i32 0), i64** [[TMP26]], align 4 +// CHECK3-32-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK3-32-NEXT: store i8** null, i8*** [[TMP27]], align 4 +// CHECK3-32-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK3-32-NEXT: store i8** null, i8*** [[TMP28]], align 4 +// CHECK3-32-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK3-32-NEXT: store i64 0, i64* [[TMP29]], align 8 +// CHECK3-32-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK3-32-NEXT: store i64 0, i64* [[TMP30]], align 8 +// CHECK3-32-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK3-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP31]], align 4 +// CHECK3-32-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK3-32-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP32]], align 4 +// CHECK3-32-NEXT: [[TMP33:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK3-32-NEXT: store i32 0, i32* [[TMP33]], align 4 +// CHECK3-32-NEXT: [[TMP34:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK3-32-NEXT: [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0 +// CHECK3-32-NEXT: br i1 [[TMP35]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK3-32: omp_offload.failed: +// CHECK3-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142(i32 [[TMP1]], i32 [[TMP3]], [10 x i32]* [[B]]) #[[ATTR3]] +// CHECK3-32-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK3-32: omp_offload.cont: +// CHECK3-32-NEXT: [[TMP36:%.*]] = load i32, i32* [[A]], align 4 +// CHECK3-32-NEXT: ret i32 [[TMP36]] +// CHECK3-32-LABEL: define {{[^@]+}}@_Z9ftemplateIiET_i +// CHECK3-32-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0]] comdat { +// CHECK3-32-NEXT: entry: +// CHECK3-32-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// CHECK3-32-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-32-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-32-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK3-32-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// CHECK3-32-NEXT: store i32 [[TMP0]], i32* [[A_CASTED]], align 4 +// CHECK3-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[A_CASTED]], align 4 +// CHECK3-32-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK3-32-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i32* +// CHECK3-32-NEXT: store i32 [[TMP1]], i32* [[TMP3]], align 4 +// CHECK3-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK3-32-NEXT: [[TMP5:%.*]] = bitcast i8** [[TMP4]] to i32* +// CHECK3-32-NEXT: store i32 [[TMP1]], i32* [[TMP5]], align 4 +// CHECK3-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CHECK3-32-NEXT: store i8* null, i8** [[TMP6]], align 4 +// CHECK3-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK3-32-NEXT: [[TMP8:%.*]] = bitcast i8** [[TMP7]] to [10 x i32]** +// CHECK3-32-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP8]], align 4 +// CHECK3-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK3-32-NEXT: [[TMP10:%.*]] = bitcast i8** [[TMP9]] to [10 x i32]** +// CHECK3-32-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP10]], align 4 +// CHECK3-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1 +// CHECK3-32-NEXT: store i8* null, i8** [[TMP11]], align 4 +// CHECK3-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK3-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK3-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 
+// CHECK3-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK3-32-NEXT: store i32 2, i32* [[TMP14]], align 4 +// CHECK3-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK3-32-NEXT: store i32 2, i32* [[TMP15]], align 4 +// CHECK3-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK3-32-NEXT: store i8** [[TMP12]], i8*** [[TMP16]], align 4 +// CHECK3-32-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK3-32-NEXT: store i8** [[TMP13]], i8*** [[TMP17]], align 4 +// CHECK3-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK3-32-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.9, i32 0, i32 0), i64** [[TMP18]], align 4 +// CHECK3-32-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK3-32-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.10, i32 0, i32 0), i64** [[TMP19]], align 4 +// CHECK3-32-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK3-32-NEXT: store i8** null, i8*** [[TMP20]], align 4 +// CHECK3-32-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK3-32-NEXT: store i8** null, i8*** [[TMP21]], align 4 +// CHECK3-32-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK3-32-NEXT: store i64 0, i64* [[TMP22]], align 8 +// CHECK3-32-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK3-32-NEXT: store i64 0, i64* [[TMP23]], align 8 +// CHECK3-32-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK3-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP24]], align 4 +// CHECK3-32-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK3-32-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP25]], align 4 +// CHECK3-32-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK3-32-NEXT: store i32 0, i32* [[TMP26]], align 4 +// CHECK3-32-NEXT: [[TMP27:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK3-32-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0 +// CHECK3-32-NEXT: br i1 [[TMP28]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK3-32: omp_offload.failed: +// CHECK3-32-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128(i32 [[TMP1]], [10 x i32]* [[B]]) #[[ATTR3]] +// CHECK3-32-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK3-32: omp_offload.cont: +// CHECK3-32-NEXT: [[TMP29:%.*]] = load i32, i32* [[A]], align 4 +// CHECK3-32-NEXT: ret i32 [[TMP29]] +// CHECK3-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167 +// CHECK3-32-SAME: (%struct.S1* noundef [[THIS:%.*]], i32 noundef [[B:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR2]] { +// CHECK3-32-NEXT: entry: +// CHECK3-32-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 4 +// CHECK3-32-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[C_ADDR:%.*]] = alloca i16*, align 4 +// CHECK3-32-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// CHECK3-32-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[__VLA_EXPR1:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 4 +// CHECK3-32-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 +// CHECK3-32-NEXT: store i32 [[VLA]], i32* [[VLA_ADDR]], align 4 +// CHECK3-32-NEXT: store i32 [[VLA1]], i32* [[VLA_ADDR2]], align 4 +// CHECK3-32-NEXT: store i16* [[C]], i16** [[C_ADDR]], align 4 +// CHECK3-32-NEXT: [[TMP0:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 4 +// CHECK3-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4 +// CHECK3-32-NEXT: [[TMP2:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4 +// CHECK3-32-NEXT: [[TMP3:%.*]] = load i16*, i16** [[C_ADDR]], align 4 +// CHECK3-32-NEXT: [[TMP4:%.*]] = call i8* @llvm.stacksave() +// CHECK3-32-NEXT: store i8* [[TMP4]], i8** [[SAVED_STACK]], align 4 +// CHECK3-32-NEXT: [[TMP5:%.*]] = mul nuw i32 [[TMP1]], [[TMP2]] +// CHECK3-32-NEXT: [[VLA3:%.*]] = alloca i16, i32 [[TMP5]], align 2 +// CHECK3-32-NEXT: store i32 [[TMP1]], i32* [[__VLA_EXPR0]], align 4 +// CHECK3-32-NEXT: store i32 [[TMP2]], i32* [[__VLA_EXPR1]], align 4 +// CHECK3-32-NEXT: [[TMP6:%.*]] = mul nuw i32 [[TMP1]], [[TMP2]] +// CHECK3-32-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 2 +// CHECK3-32-NEXT: [[TMP8:%.*]] = bitcast i16* [[VLA3]] to i8* +// CHECK3-32-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP3]] to i8* +// CHECK3-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[TMP8]], i8* align 2 [[TMP9]], i32 [[TMP7]], i1 false) +// CHECK3-32-NEXT: [[TMP10:%.*]] = load i32, i32* [[B_ADDR]], align 4 +// CHECK3-32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP10]] to double +// CHECK3-32-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00 +// CHECK3-32-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[TMP0]], i32 0, i32 0 +// CHECK3-32-NEXT: store double [[ADD]], double* [[A]], align 4 +// CHECK3-32-NEXT: [[A4:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0 +// CHECK3-32-NEXT: [[TMP11:%.*]] = load double, double* [[A4]], align 4 +// CHECK3-32-NEXT: [[INC:%.*]] = fadd double [[TMP11]], 1.000000e+00 +// CHECK3-32-NEXT: store double [[INC]], double* [[A4]], align 4 +// CHECK3-32-NEXT: [[CONV5:%.*]] = fptosi double [[INC]] to i16 +// CHECK3-32-NEXT: [[TMP12:%.*]] = mul nsw i32 1, [[TMP2]] +// CHECK3-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA3]], i32 [[TMP12]] +// CHECK3-32-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i32 1 
+// CHECK3-32-NEXT: store i16 [[CONV5]], i16* [[ARRAYIDX6]], align 2 +// CHECK3-32-NEXT: [[TMP13:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// CHECK3-32-NEXT: call void @llvm.stackrestore(i8* [[TMP13]]) +// CHECK3-32-NEXT: ret void +// CHECK3-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142 +// CHECK3-32-SAME: (i32 noundef [[A:%.*]], i32 noundef [[AAA:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR2]] { +// CHECK3-32-NEXT: entry: +// CHECK3-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[AAA_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-32-NEXT: [[B1:%.*]] = alloca [10 x i32], align 4 +// CHECK3-32-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK3-32-NEXT: store i32 [[AAA]], i32* [[AAA_ADDR]], align 4 +// CHECK3-32-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-32-NEXT: [[CONV:%.*]] = bitcast i32* [[AAA_ADDR]] to i8* +// CHECK3-32-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-32-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B1]] to i8* +// CHECK3-32-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK3-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i32 40, i1 false) +// CHECK3-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK3-32-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK3-32-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// CHECK3-32-NEXT: [[TMP4:%.*]] = load i8, i8* [[CONV]], align 1 +// CHECK3-32-NEXT: [[CONV2:%.*]] = sext i8 [[TMP4]] to i32 +// CHECK3-32-NEXT: [[ADD3:%.*]] = add nsw i32 [[CONV2]], 1 +// CHECK3-32-NEXT: [[CONV4:%.*]] = trunc i32 [[ADD3]] to i8 +// CHECK3-32-NEXT: store i8 [[CONV4]], i8* [[CONV]], align 1 +// CHECK3-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B1]], i32 0, i32 2 +// CHECK3-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK3-32-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK3-32-NEXT: store i32 [[ADD5]], i32* [[ARRAYIDX]], align 4 +// CHECK3-32-NEXT: ret void +// CHECK3-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128 +// CHECK3-32-SAME: (i32 noundef [[A:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR2]] { +// CHECK3-32-NEXT: entry: +// CHECK3-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-32-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-32-NEXT: [[B1:%.*]] = alloca [10 x i32], align 4 +// CHECK3-32-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK3-32-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-32-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-32-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B1]] to i8* +// CHECK3-32-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK3-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i32 40, i1 false) +// CHECK3-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK3-32-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK3-32-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// CHECK3-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B1]], i32 0, i32 2 +// CHECK3-32-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK3-32-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], 
1 +// CHECK3-32-NEXT: store i32 [[ADD2]], i32* [[ARRAYIDX]], align 4 +// CHECK3-32-NEXT: ret void +// CHECK3-32-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// CHECK3-32-SAME: () #[[ATTR5:[0-9]+]] { +// CHECK3-32-NEXT: entry: +// CHECK3-32-NEXT: call void @__tgt_register_requires(i64 1) +// CHECK3-32-NEXT: ret void +// TCHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63 +// TCHECK-64-SAME: (i64 noundef [[A:%.*]], i32* noundef [[P:%.*]], i64 noundef [[GA:%.*]]) #[[ATTR0:[0-9]+]] { +// TCHECK-64-NEXT: entry: +// TCHECK-64-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// TCHECK-64-NEXT: [[P_ADDR:%.*]] = alloca i32*, align 8 +// TCHECK-64-NEXT: [[GA_ADDR:%.*]] = alloca i64, align 8 +// TCHECK-64-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// TCHECK-64-NEXT: store i32* [[P]], i32** [[P_ADDR]], align 8 +// TCHECK-64-NEXT: store i64 [[GA]], i64* [[GA_ADDR]], align 8 +// TCHECK-64-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// TCHECK-64-NEXT: [[CONV1:%.*]] = bitcast i64* [[GA_ADDR]] to i32* +// TCHECK-64-NEXT: ret void +// TCHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70 +// TCHECK-64-SAME: (i64 noundef [[AA:%.*]], [10 x float]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i64 noundef [[VLA:%.*]], float* noundef nonnull align 4 dereferenceable(4) [[BN:%.*]], [5 x [10 x double]]* noundef nonnull align 8 dereferenceable(400) [[C:%.*]], i64 noundef [[VLA1:%.*]], i64 noundef [[VLA3:%.*]], double* noundef nonnull align 8 dereferenceable(8) [[CN:%.*]], %struct.TT* noundef nonnull align 8 dereferenceable(16) [[D:%.*]]) #[[ATTR0]] { +// TCHECK-64-NEXT: entry: +// TCHECK-64-NEXT: [[AA_ADDR:%.*]] = alloca i64, align 8 +// TCHECK-64-NEXT: [[B_ADDR:%.*]] = alloca [10 x float]*, align 8 +// TCHECK-64-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8 +// TCHECK-64-NEXT: [[BN_ADDR:%.*]] = alloca float*, align 8 +// TCHECK-64-NEXT: [[C_ADDR:%.*]] = alloca [5 x [10 x double]]*, align 8 +// TCHECK-64-NEXT: [[VLA_ADDR2:%.*]] = alloca i64, align 8 +// TCHECK-64-NEXT: [[VLA_ADDR4:%.*]] = alloca i64, align 8 +// TCHECK-64-NEXT: [[CN_ADDR:%.*]] = alloca double*, align 8 +// TCHECK-64-NEXT: [[D_ADDR:%.*]] = alloca %struct.TT*, align 8 +// TCHECK-64-NEXT: [[B5:%.*]] = alloca [10 x float], align 4 +// TCHECK-64-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// TCHECK-64-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// TCHECK-64-NEXT: [[C7:%.*]] = alloca [5 x [10 x double]], align 8 +// TCHECK-64-NEXT: [[__VLA_EXPR1:%.*]] = alloca i64, align 8 +// TCHECK-64-NEXT: [[__VLA_EXPR2:%.*]] = alloca i64, align 8 +// TCHECK-64-NEXT: [[D9:%.*]] = alloca [[STRUCT_TT:%.*]], align 8 +// TCHECK-64-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8 +// TCHECK-64-NEXT: store [10 x float]* [[B]], [10 x float]** [[B_ADDR]], align 8 +// TCHECK-64-NEXT: store i64 [[VLA]], i64* [[VLA_ADDR]], align 8 +// TCHECK-64-NEXT: store float* [[BN]], float** [[BN_ADDR]], align 8 +// TCHECK-64-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[C_ADDR]], align 8 +// TCHECK-64-NEXT: store i64 [[VLA1]], i64* [[VLA_ADDR2]], align 8 +// TCHECK-64-NEXT: store i64 [[VLA3]], i64* [[VLA_ADDR4]], align 8 +// TCHECK-64-NEXT: store double* [[CN]], double** [[CN_ADDR]], align 8 +// TCHECK-64-NEXT: store %struct.TT* [[D]], %struct.TT** [[D_ADDR]], align 8 +// TCHECK-64-NEXT: [[CONV:%.*]] = bitcast i64* [[AA_ADDR]] to i16* +// TCHECK-64-NEXT: [[TMP0:%.*]] = load [10 x float]*, [10 x float]** [[B_ADDR]], align 8 +// TCHECK-64-NEXT: [[TMP1:%.*]] = load 
i64, i64* [[VLA_ADDR]], align 8 +// TCHECK-64-NEXT: [[TMP2:%.*]] = load float*, float** [[BN_ADDR]], align 8 +// TCHECK-64-NEXT: [[TMP3:%.*]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]], align 8 +// TCHECK-64-NEXT: [[TMP4:%.*]] = load i64, i64* [[VLA_ADDR2]], align 8 +// TCHECK-64-NEXT: [[TMP5:%.*]] = load i64, i64* [[VLA_ADDR4]], align 8 +// TCHECK-64-NEXT: [[TMP6:%.*]] = load double*, double** [[CN_ADDR]], align 8 +// TCHECK-64-NEXT: [[TMP7:%.*]] = load %struct.TT*, %struct.TT** [[D_ADDR]], align 8 +// TCHECK-64-NEXT: [[TMP8:%.*]] = bitcast [10 x float]* [[B5]] to i8* +// TCHECK-64-NEXT: [[TMP9:%.*]] = bitcast [10 x float]* [[TMP0]] to i8* +// TCHECK-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i64 40, i1 false) +// TCHECK-64-NEXT: [[TMP10:%.*]] = call i8* @llvm.stacksave() +// TCHECK-64-NEXT: store i8* [[TMP10]], i8** [[SAVED_STACK]], align 8 +// TCHECK-64-NEXT: [[VLA6:%.*]] = alloca float, i64 [[TMP1]], align 4 +// TCHECK-64-NEXT: store i64 [[TMP1]], i64* [[__VLA_EXPR0]], align 8 +// TCHECK-64-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP1]], 4 +// TCHECK-64-NEXT: [[TMP12:%.*]] = bitcast float* [[VLA6]] to i8* +// TCHECK-64-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP2]] to i8* +// TCHECK-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP12]], i8* align 4 [[TMP13]], i64 [[TMP11]], i1 false) +// TCHECK-64-NEXT: [[TMP14:%.*]] = bitcast [5 x [10 x double]]* [[C7]] to i8* +// TCHECK-64-NEXT: [[TMP15:%.*]] = bitcast [5 x [10 x double]]* [[TMP3]] to i8* +// TCHECK-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 400, i1 false) +// TCHECK-64-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP4]], [[TMP5]] +// TCHECK-64-NEXT: [[VLA8:%.*]] = alloca double, i64 [[TMP16]], align 8 +// TCHECK-64-NEXT: store i64 [[TMP4]], i64* [[__VLA_EXPR1]], align 8 +// TCHECK-64-NEXT: store i64 [[TMP5]], i64* [[__VLA_EXPR2]], align 8 +// TCHECK-64-NEXT: [[TMP17:%.*]] = mul nuw i64 [[TMP4]], [[TMP5]] +// TCHECK-64-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 8 +// TCHECK-64-NEXT: [[TMP19:%.*]] = bitcast double* [[VLA8]] to i8* +// TCHECK-64-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP6]] to i8* +// TCHECK-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP19]], i8* align 8 [[TMP20]], i64 [[TMP18]], i1 false) +// TCHECK-64-NEXT: [[TMP21:%.*]] = bitcast %struct.TT* [[D9]] to i8* +// TCHECK-64-NEXT: [[TMP22:%.*]] = bitcast %struct.TT* [[TMP7]] to i8* +// TCHECK-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP21]], i8* align 8 [[TMP22]], i64 16, i1 false) +// TCHECK-64-NEXT: [[TMP23:%.*]] = load i16, i16* [[CONV]], align 2 +// TCHECK-64-NEXT: [[CONV10:%.*]] = sext i16 [[TMP23]] to i32 +// TCHECK-64-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV10]], 1 +// TCHECK-64-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD]] to i16 +// TCHECK-64-NEXT: store i16 [[CONV11]], i16* [[CONV]], align 2 +// TCHECK-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[B5]], i64 0, i64 2 +// TCHECK-64-NEXT: store float 1.000000e+00, float* [[ARRAYIDX]], align 4 +// TCHECK-64-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[VLA6]], i64 3 +// TCHECK-64-NEXT: store float 1.000000e+00, float* [[ARRAYIDX12]], align 4 +// TCHECK-64-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[C7]], i64 0, i64 1 +// TCHECK-64-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX13]], i64 0, i64 2 +// TCHECK-64-NEXT: store 
double 1.000000e+00, double* [[ARRAYIDX14]], align 8 +// TCHECK-64-NEXT: [[TMP24:%.*]] = mul nsw i64 1, [[TMP5]] +// TCHECK-64-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds double, double* [[VLA8]], i64 [[TMP24]] +// TCHECK-64-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX15]], i64 3 +// TCHECK-64-NEXT: store double 1.000000e+00, double* [[ARRAYIDX16]], align 8 +// TCHECK-64-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 0 +// TCHECK-64-NEXT: store i64 1, i64* [[X]], align 8 +// TCHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 1 +// TCHECK-64-NEXT: store i8 1, i8* [[Y]], align 8 +// TCHECK-64-NEXT: [[TMP25:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// TCHECK-64-NEXT: call void @llvm.stackrestore(i8* [[TMP25]]) +// TCHECK-64-NEXT: ret void +// TCHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111 +// TCHECK-64-SAME: (double* noundef [[PTR:%.*]], %struct.TT.0* noundef nonnull align 4 dereferenceable(8) [[E:%.*]]) #[[ATTR0]] { +// TCHECK-64-NEXT: entry: +// TCHECK-64-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 8 +// TCHECK-64-NEXT: [[E_ADDR:%.*]] = alloca %struct.TT.0*, align 8 +// TCHECK-64-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 8 +// TCHECK-64-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[E_ADDR]], align 8 +// TCHECK-64-NEXT: [[TMP0:%.*]] = load %struct.TT.0*, %struct.TT.0** [[E_ADDR]], align 8 +// TCHECK-64-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0:%.*]], %struct.TT.0* [[TMP0]], i32 0, i32 0 +// TCHECK-64-NEXT: [[TMP1:%.*]] = load i32, i32* [[X]], align 4 +// TCHECK-64-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP1]] to double +// TCHECK-64-NEXT: [[TMP2:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// TCHECK-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[TMP2]], i64 0 +// TCHECK-64-NEXT: store double [[CONV]], double* [[ARRAYIDX]], align 8 +// TCHECK-64-NEXT: [[TMP3:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// TCHECK-64-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[TMP3]], i64 0 +// TCHECK-64-NEXT: [[TMP4:%.*]] = load double, double* [[ARRAYIDX1]], align 8 +// TCHECK-64-NEXT: [[INC:%.*]] = fadd double [[TMP4]], 1.000000e+00 +// TCHECK-64-NEXT: store double [[INC]], double* [[ARRAYIDX1]], align 8 +// TCHECK-64-NEXT: ret void +// TCHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142 +// TCHECK-64-SAME: (i64 noundef [[A:%.*]], i64 noundef [[AAA:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// TCHECK-64-NEXT: entry: +// TCHECK-64-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// TCHECK-64-NEXT: [[AAA_ADDR:%.*]] = alloca i64, align 8 +// TCHECK-64-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// TCHECK-64-NEXT: [[B2:%.*]] = alloca [10 x i32], align 4 +// TCHECK-64-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// TCHECK-64-NEXT: store i64 [[AAA]], i64* [[AAA_ADDR]], align 8 +// TCHECK-64-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// TCHECK-64-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// TCHECK-64-NEXT: [[CONV1:%.*]] = bitcast i64* [[AAA_ADDR]] to i8* +// TCHECK-64-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// TCHECK-64-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B2]] to i8* +// TCHECK-64-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// TCHECK-64-NEXT: call void 
@llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i64 40, i1 false) +// TCHECK-64-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 4 +// TCHECK-64-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// TCHECK-64-NEXT: store i32 [[ADD]], i32* [[CONV]], align 4 +// TCHECK-64-NEXT: [[TMP4:%.*]] = load i8, i8* [[CONV1]], align 1 +// TCHECK-64-NEXT: [[CONV3:%.*]] = sext i8 [[TMP4]] to i32 +// TCHECK-64-NEXT: [[ADD4:%.*]] = add nsw i32 [[CONV3]], 1 +// TCHECK-64-NEXT: [[CONV5:%.*]] = trunc i32 [[ADD4]] to i8 +// TCHECK-64-NEXT: store i8 [[CONV5]], i8* [[CONV1]], align 1 +// TCHECK-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B2]], i64 0, i64 2 +// TCHECK-64-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// TCHECK-64-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP5]], 1 +// TCHECK-64-NEXT: store i32 [[ADD6]], i32* [[ARRAYIDX]], align 4 +// TCHECK-64-NEXT: ret void +// TCHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167 +// TCHECK-64-SAME: (%struct.S1* noundef [[THIS:%.*]], i64 noundef [[B:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR0]] { +// TCHECK-64-NEXT: entry: +// TCHECK-64-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 8 +// TCHECK-64-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 +// TCHECK-64-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8 +// TCHECK-64-NEXT: [[VLA_ADDR2:%.*]] = alloca i64, align 8 +// TCHECK-64-NEXT: [[C_ADDR:%.*]] = alloca i16*, align 8 +// TCHECK-64-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// TCHECK-64-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// TCHECK-64-NEXT: [[__VLA_EXPR1:%.*]] = alloca i64, align 8 +// TCHECK-64-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 8 +// TCHECK-64-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8 +// TCHECK-64-NEXT: store i64 [[VLA]], i64* [[VLA_ADDR]], align 8 +// TCHECK-64-NEXT: store i64 [[VLA1]], i64* [[VLA_ADDR2]], align 8 +// TCHECK-64-NEXT: store i16* [[C]], i16** [[C_ADDR]], align 8 +// TCHECK-64-NEXT: [[TMP0:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 8 +// TCHECK-64-NEXT: [[CONV:%.*]] = bitcast i64* [[B_ADDR]] to i32* +// TCHECK-64-NEXT: [[TMP1:%.*]] = load i64, i64* [[VLA_ADDR]], align 8 +// TCHECK-64-NEXT: [[TMP2:%.*]] = load i64, i64* [[VLA_ADDR2]], align 8 +// TCHECK-64-NEXT: [[TMP3:%.*]] = load i16*, i16** [[C_ADDR]], align 8 +// TCHECK-64-NEXT: [[TMP4:%.*]] = call i8* @llvm.stacksave() +// TCHECK-64-NEXT: store i8* [[TMP4]], i8** [[SAVED_STACK]], align 8 +// TCHECK-64-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP1]], [[TMP2]] +// TCHECK-64-NEXT: [[VLA3:%.*]] = alloca i16, i64 [[TMP5]], align 2 +// TCHECK-64-NEXT: store i64 [[TMP1]], i64* [[__VLA_EXPR0]], align 8 +// TCHECK-64-NEXT: store i64 [[TMP2]], i64* [[__VLA_EXPR1]], align 8 +// TCHECK-64-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP1]], [[TMP2]] +// TCHECK-64-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 2 +// TCHECK-64-NEXT: [[TMP8:%.*]] = bitcast i16* [[VLA3]] to i8* +// TCHECK-64-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP3]] to i8* +// TCHECK-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 [[TMP8]], i8* align 2 [[TMP9]], i64 [[TMP7]], i1 false) +// TCHECK-64-NEXT: [[TMP10:%.*]] = load i32, i32* [[CONV]], align 4 +// TCHECK-64-NEXT: [[CONV4:%.*]] = sitofp i32 [[TMP10]] to double +// TCHECK-64-NEXT: [[ADD:%.*]] = fadd double [[CONV4]], 1.500000e+00 +// TCHECK-64-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[TMP0]], i32 0, i32 0 +// 
TCHECK-64-NEXT: store double [[ADD]], double* [[A]], align 8 +// TCHECK-64-NEXT: [[A5:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0 +// TCHECK-64-NEXT: [[TMP11:%.*]] = load double, double* [[A5]], align 8 +// TCHECK-64-NEXT: [[INC:%.*]] = fadd double [[TMP11]], 1.000000e+00 +// TCHECK-64-NEXT: store double [[INC]], double* [[A5]], align 8 +// TCHECK-64-NEXT: [[CONV6:%.*]] = fptosi double [[INC]] to i16 +// TCHECK-64-NEXT: [[TMP12:%.*]] = mul nsw i64 1, [[TMP2]] +// TCHECK-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA3]], i64 [[TMP12]] +// TCHECK-64-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i64 1 +// TCHECK-64-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX7]], align 2 +// TCHECK-64-NEXT: [[TMP13:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// TCHECK-64-NEXT: call void @llvm.stackrestore(i8* [[TMP13]]) +// TCHECK-64-NEXT: ret void +// TCHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128 +// TCHECK-64-SAME: (i64 noundef [[A:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// TCHECK-64-NEXT: entry: +// TCHECK-64-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// TCHECK-64-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// TCHECK-64-NEXT: [[B1:%.*]] = alloca [10 x i32], align 4 +// TCHECK-64-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// TCHECK-64-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// TCHECK-64-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// TCHECK-64-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// TCHECK-64-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B1]] to i8* +// TCHECK-64-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// TCHECK-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i64 40, i1 false) +// TCHECK-64-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 4 +// TCHECK-64-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// TCHECK-64-NEXT: store i32 [[ADD]], i32* [[CONV]], align 4 +// TCHECK-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B1]], i64 0, i64 2 +// TCHECK-64-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// TCHECK-64-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], 1 +// TCHECK-64-NEXT: store i32 [[ADD2]], i32* [[ARRAYIDX]], align 4 +// TCHECK-64-NEXT: ret void +// TCHECK1-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63 +// TCHECK1-64-SAME: (i64 noundef [[A:%.*]], i32* noundef [[P:%.*]], i64 noundef [[GA:%.*]]) #[[ATTR0:[0-9]+]] { +// TCHECK1-64-NEXT: entry: +// TCHECK1-64-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// TCHECK1-64-NEXT: [[P_ADDR:%.*]] = alloca i32*, align 8 +// TCHECK1-64-NEXT: [[GA_ADDR:%.*]] = alloca i64, align 8 +// TCHECK1-64-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// TCHECK1-64-NEXT: store i32* [[P]], i32** [[P_ADDR]], align 8 +// TCHECK1-64-NEXT: store i64 [[GA]], i64* [[GA_ADDR]], align 8 +// TCHECK1-64-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// TCHECK1-64-NEXT: [[CONV1:%.*]] = bitcast i64* [[GA_ADDR]] to i32* +// TCHECK1-64-NEXT: ret void +// TCHECK1-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70 +// TCHECK1-64-SAME: (i64 noundef [[AA:%.*]], [10 x float]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i64 noundef [[VLA:%.*]], float* noundef nonnull align 4 dereferenceable(4) [[BN:%.*]], [5 x [10 x double]]* noundef nonnull align 8 
dereferenceable(400) [[C:%.*]], i64 noundef [[VLA1:%.*]], i64 noundef [[VLA3:%.*]], double* noundef nonnull align 8 dereferenceable(8) [[CN:%.*]], %struct.TT* noundef nonnull align 8 dereferenceable(16) [[D:%.*]]) #[[ATTR0]] { +// TCHECK1-64-NEXT: entry: +// TCHECK1-64-NEXT: [[AA_ADDR:%.*]] = alloca i64, align 8 +// TCHECK1-64-NEXT: [[B_ADDR:%.*]] = alloca [10 x float]*, align 8 +// TCHECK1-64-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8 +// TCHECK1-64-NEXT: [[BN_ADDR:%.*]] = alloca float*, align 8 +// TCHECK1-64-NEXT: [[C_ADDR:%.*]] = alloca [5 x [10 x double]]*, align 8 +// TCHECK1-64-NEXT: [[VLA_ADDR2:%.*]] = alloca i64, align 8 +// TCHECK1-64-NEXT: [[VLA_ADDR4:%.*]] = alloca i64, align 8 +// TCHECK1-64-NEXT: [[CN_ADDR:%.*]] = alloca double*, align 8 +// TCHECK1-64-NEXT: [[D_ADDR:%.*]] = alloca %struct.TT*, align 8 +// TCHECK1-64-NEXT: [[B5:%.*]] = alloca [10 x float], align 4 +// TCHECK1-64-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// TCHECK1-64-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// TCHECK1-64-NEXT: [[C7:%.*]] = alloca [5 x [10 x double]], align 8 +// TCHECK1-64-NEXT: [[__VLA_EXPR1:%.*]] = alloca i64, align 8 +// TCHECK1-64-NEXT: [[__VLA_EXPR2:%.*]] = alloca i64, align 8 +// TCHECK1-64-NEXT: [[D9:%.*]] = alloca [[STRUCT_TT:%.*]], align 8 +// TCHECK1-64-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8 +// TCHECK1-64-NEXT: store [10 x float]* [[B]], [10 x float]** [[B_ADDR]], align 8 +// TCHECK1-64-NEXT: store i64 [[VLA]], i64* [[VLA_ADDR]], align 8 +// TCHECK1-64-NEXT: store float* [[BN]], float** [[BN_ADDR]], align 8 +// TCHECK1-64-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[C_ADDR]], align 8 +// TCHECK1-64-NEXT: store i64 [[VLA1]], i64* [[VLA_ADDR2]], align 8 +// TCHECK1-64-NEXT: store i64 [[VLA3]], i64* [[VLA_ADDR4]], align 8 +// TCHECK1-64-NEXT: store double* [[CN]], double** [[CN_ADDR]], align 8 +// TCHECK1-64-NEXT: store %struct.TT* [[D]], %struct.TT** [[D_ADDR]], align 8 +// TCHECK1-64-NEXT: [[CONV:%.*]] = bitcast i64* [[AA_ADDR]] to i16* +// TCHECK1-64-NEXT: [[TMP0:%.*]] = load [10 x float]*, [10 x float]** [[B_ADDR]], align 8 +// TCHECK1-64-NEXT: [[TMP1:%.*]] = load i64, i64* [[VLA_ADDR]], align 8 +// TCHECK1-64-NEXT: [[TMP2:%.*]] = load float*, float** [[BN_ADDR]], align 8 +// TCHECK1-64-NEXT: [[TMP3:%.*]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]], align 8 +// TCHECK1-64-NEXT: [[TMP4:%.*]] = load i64, i64* [[VLA_ADDR2]], align 8 +// TCHECK1-64-NEXT: [[TMP5:%.*]] = load i64, i64* [[VLA_ADDR4]], align 8 +// TCHECK1-64-NEXT: [[TMP6:%.*]] = load double*, double** [[CN_ADDR]], align 8 +// TCHECK1-64-NEXT: [[TMP7:%.*]] = load %struct.TT*, %struct.TT** [[D_ADDR]], align 8 +// TCHECK1-64-NEXT: [[TMP8:%.*]] = bitcast [10 x float]* [[B5]] to i8* +// TCHECK1-64-NEXT: [[TMP9:%.*]] = bitcast [10 x float]* [[TMP0]] to i8* +// TCHECK1-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i64 40, i1 false) +// TCHECK1-64-NEXT: [[TMP10:%.*]] = call i8* @llvm.stacksave() +// TCHECK1-64-NEXT: store i8* [[TMP10]], i8** [[SAVED_STACK]], align 8 +// TCHECK1-64-NEXT: [[VLA6:%.*]] = alloca float, i64 [[TMP1]], align 4 +// TCHECK1-64-NEXT: store i64 [[TMP1]], i64* [[__VLA_EXPR0]], align 8 +// TCHECK1-64-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP1]], 4 +// TCHECK1-64-NEXT: [[TMP12:%.*]] = bitcast float* [[VLA6]] to i8* +// TCHECK1-64-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP2]] to i8* +// TCHECK1-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP12]], i8* align 4 [[TMP13]], i64 [[TMP11]], i1 
false) +// TCHECK1-64-NEXT: [[TMP14:%.*]] = bitcast [5 x [10 x double]]* [[C7]] to i8* +// TCHECK1-64-NEXT: [[TMP15:%.*]] = bitcast [5 x [10 x double]]* [[TMP3]] to i8* +// TCHECK1-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 400, i1 false) +// TCHECK1-64-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP4]], [[TMP5]] +// TCHECK1-64-NEXT: [[VLA8:%.*]] = alloca double, i64 [[TMP16]], align 8 +// TCHECK1-64-NEXT: store i64 [[TMP4]], i64* [[__VLA_EXPR1]], align 8 +// TCHECK1-64-NEXT: store i64 [[TMP5]], i64* [[__VLA_EXPR2]], align 8 +// TCHECK1-64-NEXT: [[TMP17:%.*]] = mul nuw i64 [[TMP4]], [[TMP5]] +// TCHECK1-64-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 8 +// TCHECK1-64-NEXT: [[TMP19:%.*]] = bitcast double* [[VLA8]] to i8* +// TCHECK1-64-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP6]] to i8* +// TCHECK1-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP19]], i8* align 8 [[TMP20]], i64 [[TMP18]], i1 false) +// TCHECK1-64-NEXT: [[TMP21:%.*]] = bitcast %struct.TT* [[D9]] to i8* +// TCHECK1-64-NEXT: [[TMP22:%.*]] = bitcast %struct.TT* [[TMP7]] to i8* +// TCHECK1-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP21]], i8* align 8 [[TMP22]], i64 16, i1 false) +// TCHECK1-64-NEXT: [[TMP23:%.*]] = load i16, i16* [[CONV]], align 2 +// TCHECK1-64-NEXT: [[CONV10:%.*]] = sext i16 [[TMP23]] to i32 +// TCHECK1-64-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV10]], 1 +// TCHECK1-64-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD]] to i16 +// TCHECK1-64-NEXT: store i16 [[CONV11]], i16* [[CONV]], align 2 +// TCHECK1-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[B5]], i64 0, i64 2 +// TCHECK1-64-NEXT: store float 1.000000e+00, float* [[ARRAYIDX]], align 4 +// TCHECK1-64-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[VLA6]], i64 3 +// TCHECK1-64-NEXT: store float 1.000000e+00, float* [[ARRAYIDX12]], align 4 +// TCHECK1-64-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[C7]], i64 0, i64 1 +// TCHECK1-64-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX13]], i64 0, i64 2 +// TCHECK1-64-NEXT: store double 1.000000e+00, double* [[ARRAYIDX14]], align 8 +// TCHECK1-64-NEXT: [[TMP24:%.*]] = mul nsw i64 1, [[TMP5]] +// TCHECK1-64-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds double, double* [[VLA8]], i64 [[TMP24]] +// TCHECK1-64-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX15]], i64 3 +// TCHECK1-64-NEXT: store double 1.000000e+00, double* [[ARRAYIDX16]], align 8 +// TCHECK1-64-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 0 +// TCHECK1-64-NEXT: store i64 1, i64* [[X]], align 8 +// TCHECK1-64-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 1 +// TCHECK1-64-NEXT: store i8 1, i8* [[Y]], align 8 +// TCHECK1-64-NEXT: [[TMP25:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// TCHECK1-64-NEXT: call void @llvm.stackrestore(i8* [[TMP25]]) +// TCHECK1-64-NEXT: ret void +// TCHECK1-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111 +// TCHECK1-64-SAME: (double* noundef [[PTR:%.*]], %struct.TT.0* noundef nonnull align 4 dereferenceable(8) [[E:%.*]]) #[[ATTR0]] { +// TCHECK1-64-NEXT: entry: +// TCHECK1-64-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 8 +// TCHECK1-64-NEXT: [[E_ADDR:%.*]] = alloca %struct.TT.0*, align 8 +// TCHECK1-64-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 8 
+// TCHECK1-64-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[E_ADDR]], align 8 +// TCHECK1-64-NEXT: [[TMP0:%.*]] = load %struct.TT.0*, %struct.TT.0** [[E_ADDR]], align 8 +// TCHECK1-64-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0:%.*]], %struct.TT.0* [[TMP0]], i32 0, i32 0 +// TCHECK1-64-NEXT: [[TMP1:%.*]] = load i32, i32* [[X]], align 4 +// TCHECK1-64-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP1]] to double +// TCHECK1-64-NEXT: [[TMP2:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// TCHECK1-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[TMP2]], i64 0 +// TCHECK1-64-NEXT: store double [[CONV]], double* [[ARRAYIDX]], align 8 +// TCHECK1-64-NEXT: [[TMP3:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// TCHECK1-64-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[TMP3]], i64 0 +// TCHECK1-64-NEXT: [[TMP4:%.*]] = load double, double* [[ARRAYIDX1]], align 8 +// TCHECK1-64-NEXT: [[INC:%.*]] = fadd double [[TMP4]], 1.000000e+00 +// TCHECK1-64-NEXT: store double [[INC]], double* [[ARRAYIDX1]], align 8 +// TCHECK1-64-NEXT: ret void +// TCHECK1-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142 +// TCHECK1-64-SAME: (i64 noundef [[A:%.*]], i64 noundef [[AAA:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// TCHECK1-64-NEXT: entry: +// TCHECK1-64-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// TCHECK1-64-NEXT: [[AAA_ADDR:%.*]] = alloca i64, align 8 +// TCHECK1-64-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// TCHECK1-64-NEXT: [[B2:%.*]] = alloca [10 x i32], align 4 +// TCHECK1-64-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// TCHECK1-64-NEXT: store i64 [[AAA]], i64* [[AAA_ADDR]], align 8 +// TCHECK1-64-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// TCHECK1-64-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// TCHECK1-64-NEXT: [[CONV1:%.*]] = bitcast i64* [[AAA_ADDR]] to i8* +// TCHECK1-64-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// TCHECK1-64-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B2]] to i8* +// TCHECK1-64-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// TCHECK1-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i64 40, i1 false) +// TCHECK1-64-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 4 +// TCHECK1-64-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// TCHECK1-64-NEXT: store i32 [[ADD]], i32* [[CONV]], align 4 +// TCHECK1-64-NEXT: [[TMP4:%.*]] = load i8, i8* [[CONV1]], align 1 +// TCHECK1-64-NEXT: [[CONV3:%.*]] = sext i8 [[TMP4]] to i32 +// TCHECK1-64-NEXT: [[ADD4:%.*]] = add nsw i32 [[CONV3]], 1 +// TCHECK1-64-NEXT: [[CONV5:%.*]] = trunc i32 [[ADD4]] to i8 +// TCHECK1-64-NEXT: store i8 [[CONV5]], i8* [[CONV1]], align 1 +// TCHECK1-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B2]], i64 0, i64 2 +// TCHECK1-64-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// TCHECK1-64-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP5]], 1 +// TCHECK1-64-NEXT: store i32 [[ADD6]], i32* [[ARRAYIDX]], align 4 +// TCHECK1-64-NEXT: ret void +// TCHECK1-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167 +// TCHECK1-64-SAME: (%struct.S1* noundef [[THIS:%.*]], i64 noundef [[B:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR0]] { +// TCHECK1-64-NEXT: entry: +// TCHECK1-64-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, 
align 8 +// TCHECK1-64-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 +// TCHECK1-64-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8 +// TCHECK1-64-NEXT: [[VLA_ADDR2:%.*]] = alloca i64, align 8 +// TCHECK1-64-NEXT: [[C_ADDR:%.*]] = alloca i16*, align 8 +// TCHECK1-64-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// TCHECK1-64-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// TCHECK1-64-NEXT: [[__VLA_EXPR1:%.*]] = alloca i64, align 8 +// TCHECK1-64-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 8 +// TCHECK1-64-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8 +// TCHECK1-64-NEXT: store i64 [[VLA]], i64* [[VLA_ADDR]], align 8 +// TCHECK1-64-NEXT: store i64 [[VLA1]], i64* [[VLA_ADDR2]], align 8 +// TCHECK1-64-NEXT: store i16* [[C]], i16** [[C_ADDR]], align 8 +// TCHECK1-64-NEXT: [[TMP0:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 8 +// TCHECK1-64-NEXT: [[CONV:%.*]] = bitcast i64* [[B_ADDR]] to i32* +// TCHECK1-64-NEXT: [[TMP1:%.*]] = load i64, i64* [[VLA_ADDR]], align 8 +// TCHECK1-64-NEXT: [[TMP2:%.*]] = load i64, i64* [[VLA_ADDR2]], align 8 +// TCHECK1-64-NEXT: [[TMP3:%.*]] = load i16*, i16** [[C_ADDR]], align 8 +// TCHECK1-64-NEXT: [[TMP4:%.*]] = call i8* @llvm.stacksave() +// TCHECK1-64-NEXT: store i8* [[TMP4]], i8** [[SAVED_STACK]], align 8 +// TCHECK1-64-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP1]], [[TMP2]] +// TCHECK1-64-NEXT: [[VLA3:%.*]] = alloca i16, i64 [[TMP5]], align 2 +// TCHECK1-64-NEXT: store i64 [[TMP1]], i64* [[__VLA_EXPR0]], align 8 +// TCHECK1-64-NEXT: store i64 [[TMP2]], i64* [[__VLA_EXPR1]], align 8 +// TCHECK1-64-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP1]], [[TMP2]] +// TCHECK1-64-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 2 +// TCHECK1-64-NEXT: [[TMP8:%.*]] = bitcast i16* [[VLA3]] to i8* +// TCHECK1-64-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP3]] to i8* +// TCHECK1-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 [[TMP8]], i8* align 2 [[TMP9]], i64 [[TMP7]], i1 false) +// TCHECK1-64-NEXT: [[TMP10:%.*]] = load i32, i32* [[CONV]], align 4 +// TCHECK1-64-NEXT: [[CONV4:%.*]] = sitofp i32 [[TMP10]] to double +// TCHECK1-64-NEXT: [[ADD:%.*]] = fadd double [[CONV4]], 1.500000e+00 +// TCHECK1-64-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[TMP0]], i32 0, i32 0 +// TCHECK1-64-NEXT: store double [[ADD]], double* [[A]], align 8 +// TCHECK1-64-NEXT: [[A5:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0 +// TCHECK1-64-NEXT: [[TMP11:%.*]] = load double, double* [[A5]], align 8 +// TCHECK1-64-NEXT: [[INC:%.*]] = fadd double [[TMP11]], 1.000000e+00 +// TCHECK1-64-NEXT: store double [[INC]], double* [[A5]], align 8 +// TCHECK1-64-NEXT: [[CONV6:%.*]] = fptosi double [[INC]] to i16 +// TCHECK1-64-NEXT: [[TMP12:%.*]] = mul nsw i64 1, [[TMP2]] +// TCHECK1-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA3]], i64 [[TMP12]] +// TCHECK1-64-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i64 1 +// TCHECK1-64-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX7]], align 2 +// TCHECK1-64-NEXT: [[TMP13:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// TCHECK1-64-NEXT: call void @llvm.stackrestore(i8* [[TMP13]]) +// TCHECK1-64-NEXT: ret void +// TCHECK1-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128 +// TCHECK1-64-SAME: (i64 noundef [[A:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// TCHECK1-64-NEXT: entry: +// TCHECK1-64-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// 
TCHECK1-64-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// TCHECK1-64-NEXT: [[B1:%.*]] = alloca [10 x i32], align 4 +// TCHECK1-64-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// TCHECK1-64-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// TCHECK1-64-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// TCHECK1-64-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// TCHECK1-64-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B1]] to i8* +// TCHECK1-64-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// TCHECK1-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i64 40, i1 false) +// TCHECK1-64-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 4 +// TCHECK1-64-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// TCHECK1-64-NEXT: store i32 [[ADD]], i32* [[CONV]], align 4 +// TCHECK1-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B1]], i64 0, i64 2 +// TCHECK1-64-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// TCHECK1-64-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], 1 +// TCHECK1-64-NEXT: store i32 [[ADD2]], i32* [[ARRAYIDX]], align 4 +// TCHECK1-64-NEXT: ret void +// TCHECK2-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63 +// TCHECK2-32-SAME: (i32 noundef [[A:%.*]], i32* noundef [[P:%.*]], i32 noundef [[GA:%.*]]) #[[ATTR0:[0-9]+]] { +// TCHECK2-32-NEXT: entry: +// TCHECK2-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// TCHECK2-32-NEXT: [[P_ADDR:%.*]] = alloca i32*, align 4 +// TCHECK2-32-NEXT: [[GA_ADDR:%.*]] = alloca i32, align 4 +// TCHECK2-32-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// TCHECK2-32-NEXT: store i32* [[P]], i32** [[P_ADDR]], align 4 +// TCHECK2-32-NEXT: store i32 [[GA]], i32* [[GA_ADDR]], align 4 +// TCHECK2-32-NEXT: ret void +// TCHECK2-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70 +// TCHECK2-32-SAME: (i32 noundef [[AA:%.*]], [10 x float]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i32 noundef [[VLA:%.*]], float* noundef nonnull align 4 dereferenceable(4) [[BN:%.*]], [5 x [10 x double]]* noundef nonnull align 4 dereferenceable(400) [[C:%.*]], i32 noundef [[VLA1:%.*]], i32 noundef [[VLA3:%.*]], double* noundef nonnull align 4 dereferenceable(8) [[CN:%.*]], %struct.TT* noundef nonnull align 4 dereferenceable(12) [[D:%.*]]) #[[ATTR0]] { +// TCHECK2-32-NEXT: entry: +// TCHECK2-32-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4 +// TCHECK2-32-NEXT: [[B_ADDR:%.*]] = alloca [10 x float]*, align 4 +// TCHECK2-32-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4 +// TCHECK2-32-NEXT: [[BN_ADDR:%.*]] = alloca float*, align 4 +// TCHECK2-32-NEXT: [[C_ADDR:%.*]] = alloca [5 x [10 x double]]*, align 4 +// TCHECK2-32-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4 +// TCHECK2-32-NEXT: [[VLA_ADDR4:%.*]] = alloca i32, align 4 +// TCHECK2-32-NEXT: [[CN_ADDR:%.*]] = alloca double*, align 4 +// TCHECK2-32-NEXT: [[D_ADDR:%.*]] = alloca %struct.TT*, align 4 +// TCHECK2-32-NEXT: [[B5:%.*]] = alloca [10 x float], align 4 +// TCHECK2-32-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// TCHECK2-32-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// TCHECK2-32-NEXT: [[C7:%.*]] = alloca [5 x [10 x double]], align 8 +// TCHECK2-32-NEXT: [[__VLA_EXPR1:%.*]] = alloca i32, align 4 +// TCHECK2-32-NEXT: [[__VLA_EXPR2:%.*]] = alloca i32, align 4 +// TCHECK2-32-NEXT: [[D9:%.*]] = alloca [[STRUCT_TT:%.*]], align 4 +// TCHECK2-32-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 +// 
TCHECK2-32-NEXT: store [10 x float]* [[B]], [10 x float]** [[B_ADDR]], align 4 +// TCHECK2-32-NEXT: store i32 [[VLA]], i32* [[VLA_ADDR]], align 4 +// TCHECK2-32-NEXT: store float* [[BN]], float** [[BN_ADDR]], align 4 +// TCHECK2-32-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[C_ADDR]], align 4 +// TCHECK2-32-NEXT: store i32 [[VLA1]], i32* [[VLA_ADDR2]], align 4 +// TCHECK2-32-NEXT: store i32 [[VLA3]], i32* [[VLA_ADDR4]], align 4 +// TCHECK2-32-NEXT: store double* [[CN]], double** [[CN_ADDR]], align 4 +// TCHECK2-32-NEXT: store %struct.TT* [[D]], %struct.TT** [[D_ADDR]], align 4 +// TCHECK2-32-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* +// TCHECK2-32-NEXT: [[TMP0:%.*]] = load [10 x float]*, [10 x float]** [[B_ADDR]], align 4 +// TCHECK2-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4 +// TCHECK2-32-NEXT: [[TMP2:%.*]] = load float*, float** [[BN_ADDR]], align 4 +// TCHECK2-32-NEXT: [[TMP3:%.*]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]], align 4 +// TCHECK2-32-NEXT: [[TMP4:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4 +// TCHECK2-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[VLA_ADDR4]], align 4 +// TCHECK2-32-NEXT: [[TMP6:%.*]] = load double*, double** [[CN_ADDR]], align 4 +// TCHECK2-32-NEXT: [[TMP7:%.*]] = load %struct.TT*, %struct.TT** [[D_ADDR]], align 4 +// TCHECK2-32-NEXT: [[TMP8:%.*]] = bitcast [10 x float]* [[B5]] to i8* +// TCHECK2-32-NEXT: [[TMP9:%.*]] = bitcast [10 x float]* [[TMP0]] to i8* +// TCHECK2-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i32 40, i1 false) +// TCHECK2-32-NEXT: [[TMP10:%.*]] = call i8* @llvm.stacksave() +// TCHECK2-32-NEXT: store i8* [[TMP10]], i8** [[SAVED_STACK]], align 4 +// TCHECK2-32-NEXT: [[VLA6:%.*]] = alloca float, i32 [[TMP1]], align 4 +// TCHECK2-32-NEXT: store i32 [[TMP1]], i32* [[__VLA_EXPR0]], align 4 +// TCHECK2-32-NEXT: [[TMP11:%.*]] = mul nuw i32 [[TMP1]], 4 +// TCHECK2-32-NEXT: [[TMP12:%.*]] = bitcast float* [[VLA6]] to i8* +// TCHECK2-32-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP2]] to i8* +// TCHECK2-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP12]], i8* align 4 [[TMP13]], i32 [[TMP11]], i1 false) +// TCHECK2-32-NEXT: [[TMP14:%.*]] = bitcast [5 x [10 x double]]* [[C7]] to i8* +// TCHECK2-32-NEXT: [[TMP15:%.*]] = bitcast [5 x [10 x double]]* [[TMP3]] to i8* +// TCHECK2-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i32 400, i1 false) +// TCHECK2-32-NEXT: [[TMP16:%.*]] = mul nuw i32 [[TMP4]], [[TMP5]] +// TCHECK2-32-NEXT: [[VLA8:%.*]] = alloca double, i32 [[TMP16]], align 8 +// TCHECK2-32-NEXT: store i32 [[TMP4]], i32* [[__VLA_EXPR1]], align 4 +// TCHECK2-32-NEXT: store i32 [[TMP5]], i32* [[__VLA_EXPR2]], align 4 +// TCHECK2-32-NEXT: [[TMP17:%.*]] = mul nuw i32 [[TMP4]], [[TMP5]] +// TCHECK2-32-NEXT: [[TMP18:%.*]] = mul nuw i32 [[TMP17]], 8 +// TCHECK2-32-NEXT: [[TMP19:%.*]] = bitcast double* [[VLA8]] to i8* +// TCHECK2-32-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP6]] to i8* +// TCHECK2-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP19]], i8* align 8 [[TMP20]], i32 [[TMP18]], i1 false) +// TCHECK2-32-NEXT: [[TMP21:%.*]] = bitcast %struct.TT* [[D9]] to i8* +// TCHECK2-32-NEXT: [[TMP22:%.*]] = bitcast %struct.TT* [[TMP7]] to i8* +// TCHECK2-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP21]], i8* align 4 [[TMP22]], i32 12, i1 false) +// TCHECK2-32-NEXT: [[TMP23:%.*]] = load i16, i16* [[CONV]], align 2 +// TCHECK2-32-NEXT: [[CONV10:%.*]] = sext i16 
[[TMP23]] to i32 +// TCHECK2-32-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV10]], 1 +// TCHECK2-32-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD]] to i16 +// TCHECK2-32-NEXT: store i16 [[CONV11]], i16* [[CONV]], align 2 +// TCHECK2-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[B5]], i32 0, i32 2 +// TCHECK2-32-NEXT: store float 1.000000e+00, float* [[ARRAYIDX]], align 4 +// TCHECK2-32-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[VLA6]], i32 3 +// TCHECK2-32-NEXT: store float 1.000000e+00, float* [[ARRAYIDX12]], align 4 +// TCHECK2-32-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[C7]], i32 0, i32 1 +// TCHECK2-32-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX13]], i32 0, i32 2 +// TCHECK2-32-NEXT: store double 1.000000e+00, double* [[ARRAYIDX14]], align 8 +// TCHECK2-32-NEXT: [[TMP24:%.*]] = mul nsw i32 1, [[TMP5]] +// TCHECK2-32-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds double, double* [[VLA8]], i32 [[TMP24]] +// TCHECK2-32-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX15]], i32 3 +// TCHECK2-32-NEXT: store double 1.000000e+00, double* [[ARRAYIDX16]], align 8 +// TCHECK2-32-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 0 +// TCHECK2-32-NEXT: store i64 1, i64* [[X]], align 4 +// TCHECK2-32-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 1 +// TCHECK2-32-NEXT: store i8 1, i8* [[Y]], align 4 +// TCHECK2-32-NEXT: [[TMP25:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// TCHECK2-32-NEXT: call void @llvm.stackrestore(i8* [[TMP25]]) +// TCHECK2-32-NEXT: ret void +// TCHECK2-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111 +// TCHECK2-32-SAME: (double* noundef [[PTR:%.*]], %struct.TT.0* noundef nonnull align 4 dereferenceable(8) [[E:%.*]]) #[[ATTR0]] { +// TCHECK2-32-NEXT: entry: +// TCHECK2-32-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 4 +// TCHECK2-32-NEXT: [[E_ADDR:%.*]] = alloca %struct.TT.0*, align 4 +// TCHECK2-32-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 4 +// TCHECK2-32-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[E_ADDR]], align 4 +// TCHECK2-32-NEXT: [[TMP0:%.*]] = load %struct.TT.0*, %struct.TT.0** [[E_ADDR]], align 4 +// TCHECK2-32-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0:%.*]], %struct.TT.0* [[TMP0]], i32 0, i32 0 +// TCHECK2-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[X]], align 4 +// TCHECK2-32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP1]] to double +// TCHECK2-32-NEXT: [[TMP2:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// TCHECK2-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0 +// TCHECK2-32-NEXT: store double [[CONV]], double* [[ARRAYIDX]], align 4 +// TCHECK2-32-NEXT: [[TMP3:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// TCHECK2-32-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0 +// TCHECK2-32-NEXT: [[TMP4:%.*]] = load double, double* [[ARRAYIDX1]], align 4 +// TCHECK2-32-NEXT: [[INC:%.*]] = fadd double [[TMP4]], 1.000000e+00 +// TCHECK2-32-NEXT: store double [[INC]], double* [[ARRAYIDX1]], align 4 +// TCHECK2-32-NEXT: ret void +// TCHECK2-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142 +// TCHECK2-32-SAME: (i32 noundef [[A:%.*]], i32 noundef [[AAA:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// 
TCHECK2-32-NEXT: entry: +// TCHECK2-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// TCHECK2-32-NEXT: [[AAA_ADDR:%.*]] = alloca i32, align 4 +// TCHECK2-32-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// TCHECK2-32-NEXT: [[B1:%.*]] = alloca [10 x i32], align 4 +// TCHECK2-32-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// TCHECK2-32-NEXT: store i32 [[AAA]], i32* [[AAA_ADDR]], align 4 +// TCHECK2-32-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// TCHECK2-32-NEXT: [[CONV:%.*]] = bitcast i32* [[AAA_ADDR]] to i8* +// TCHECK2-32-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// TCHECK2-32-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B1]] to i8* +// TCHECK2-32-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// TCHECK2-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i32 40, i1 false) +// TCHECK2-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// TCHECK2-32-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// TCHECK2-32-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// TCHECK2-32-NEXT: [[TMP4:%.*]] = load i8, i8* [[CONV]], align 1 +// TCHECK2-32-NEXT: [[CONV2:%.*]] = sext i8 [[TMP4]] to i32 +// TCHECK2-32-NEXT: [[ADD3:%.*]] = add nsw i32 [[CONV2]], 1 +// TCHECK2-32-NEXT: [[CONV4:%.*]] = trunc i32 [[ADD3]] to i8 +// TCHECK2-32-NEXT: store i8 [[CONV4]], i8* [[CONV]], align 1 +// TCHECK2-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B1]], i32 0, i32 2 +// TCHECK2-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// TCHECK2-32-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP5]], 1 +// TCHECK2-32-NEXT: store i32 [[ADD5]], i32* [[ARRAYIDX]], align 4 +// TCHECK2-32-NEXT: ret void +// TCHECK2-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167 +// TCHECK2-32-SAME: (%struct.S1* noundef [[THIS:%.*]], i32 noundef [[B:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR0]] { +// TCHECK2-32-NEXT: entry: +// TCHECK2-32-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 4 +// TCHECK2-32-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// TCHECK2-32-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4 +// TCHECK2-32-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4 +// TCHECK2-32-NEXT: [[C_ADDR:%.*]] = alloca i16*, align 4 +// TCHECK2-32-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// TCHECK2-32-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// TCHECK2-32-NEXT: [[__VLA_EXPR1:%.*]] = alloca i32, align 4 +// TCHECK2-32-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 4 +// TCHECK2-32-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 +// TCHECK2-32-NEXT: store i32 [[VLA]], i32* [[VLA_ADDR]], align 4 +// TCHECK2-32-NEXT: store i32 [[VLA1]], i32* [[VLA_ADDR2]], align 4 +// TCHECK2-32-NEXT: store i16* [[C]], i16** [[C_ADDR]], align 4 +// TCHECK2-32-NEXT: [[TMP0:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 4 +// TCHECK2-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4 +// TCHECK2-32-NEXT: [[TMP2:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4 +// TCHECK2-32-NEXT: [[TMP3:%.*]] = load i16*, i16** [[C_ADDR]], align 4 +// TCHECK2-32-NEXT: [[TMP4:%.*]] = call i8* @llvm.stacksave() +// TCHECK2-32-NEXT: store i8* [[TMP4]], i8** [[SAVED_STACK]], align 4 +// TCHECK2-32-NEXT: [[TMP5:%.*]] = mul nuw i32 [[TMP1]], [[TMP2]] +// TCHECK2-32-NEXT: [[VLA3:%.*]] = alloca i16, i32 [[TMP5]], align 2 +// TCHECK2-32-NEXT: store i32 [[TMP1]], i32* 
[[__VLA_EXPR0]], align 4 +// TCHECK2-32-NEXT: store i32 [[TMP2]], i32* [[__VLA_EXPR1]], align 4 +// TCHECK2-32-NEXT: [[TMP6:%.*]] = mul nuw i32 [[TMP1]], [[TMP2]] +// TCHECK2-32-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 2 +// TCHECK2-32-NEXT: [[TMP8:%.*]] = bitcast i16* [[VLA3]] to i8* +// TCHECK2-32-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP3]] to i8* +// TCHECK2-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[TMP8]], i8* align 2 [[TMP9]], i32 [[TMP7]], i1 false) +// TCHECK2-32-NEXT: [[TMP10:%.*]] = load i32, i32* [[B_ADDR]], align 4 +// TCHECK2-32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP10]] to double +// TCHECK2-32-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00 +// TCHECK2-32-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[TMP0]], i32 0, i32 0 +// TCHECK2-32-NEXT: store double [[ADD]], double* [[A]], align 4 +// TCHECK2-32-NEXT: [[A4:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0 +// TCHECK2-32-NEXT: [[TMP11:%.*]] = load double, double* [[A4]], align 4 +// TCHECK2-32-NEXT: [[INC:%.*]] = fadd double [[TMP11]], 1.000000e+00 +// TCHECK2-32-NEXT: store double [[INC]], double* [[A4]], align 4 +// TCHECK2-32-NEXT: [[CONV5:%.*]] = fptosi double [[INC]] to i16 +// TCHECK2-32-NEXT: [[TMP12:%.*]] = mul nsw i32 1, [[TMP2]] +// TCHECK2-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA3]], i32 [[TMP12]] +// TCHECK2-32-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i32 1 +// TCHECK2-32-NEXT: store i16 [[CONV5]], i16* [[ARRAYIDX6]], align 2 +// TCHECK2-32-NEXT: [[TMP13:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// TCHECK2-32-NEXT: call void @llvm.stackrestore(i8* [[TMP13]]) +// TCHECK2-32-NEXT: ret void +// TCHECK2-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128 +// TCHECK2-32-SAME: (i32 noundef [[A:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// TCHECK2-32-NEXT: entry: +// TCHECK2-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// TCHECK2-32-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// TCHECK2-32-NEXT: [[B1:%.*]] = alloca [10 x i32], align 4 +// TCHECK2-32-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// TCHECK2-32-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// TCHECK2-32-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// TCHECK2-32-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B1]] to i8* +// TCHECK2-32-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// TCHECK2-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i32 40, i1 false) +// TCHECK2-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// TCHECK2-32-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// TCHECK2-32-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// TCHECK2-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B1]], i32 0, i32 2 +// TCHECK2-32-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// TCHECK2-32-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], 1 +// TCHECK2-32-NEXT: store i32 [[ADD2]], i32* [[ARRAYIDX]], align 4 +// TCHECK2-32-NEXT: ret void +// TCHECK3-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63 +// TCHECK3-32-SAME: (i32 noundef [[A:%.*]], i32* noundef [[P:%.*]], i32 noundef [[GA:%.*]]) #[[ATTR0:[0-9]+]] { +// TCHECK3-32-NEXT: entry: +// TCHECK3-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// TCHECK3-32-NEXT: [[P_ADDR:%.*]] = alloca i32*, 
align 4 +// TCHECK3-32-NEXT: [[GA_ADDR:%.*]] = alloca i32, align 4 +// TCHECK3-32-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// TCHECK3-32-NEXT: store i32* [[P]], i32** [[P_ADDR]], align 4 +// TCHECK3-32-NEXT: store i32 [[GA]], i32* [[GA_ADDR]], align 4 +// TCHECK3-32-NEXT: ret void +// TCHECK3-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70 +// TCHECK3-32-SAME: (i32 noundef [[AA:%.*]], [10 x float]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i32 noundef [[VLA:%.*]], float* noundef nonnull align 4 dereferenceable(4) [[BN:%.*]], [5 x [10 x double]]* noundef nonnull align 4 dereferenceable(400) [[C:%.*]], i32 noundef [[VLA1:%.*]], i32 noundef [[VLA3:%.*]], double* noundef nonnull align 4 dereferenceable(8) [[CN:%.*]], %struct.TT* noundef nonnull align 4 dereferenceable(12) [[D:%.*]]) #[[ATTR0]] { +// TCHECK3-32-NEXT: entry: +// TCHECK3-32-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4 +// TCHECK3-32-NEXT: [[B_ADDR:%.*]] = alloca [10 x float]*, align 4 +// TCHECK3-32-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4 +// TCHECK3-32-NEXT: [[BN_ADDR:%.*]] = alloca float*, align 4 +// TCHECK3-32-NEXT: [[C_ADDR:%.*]] = alloca [5 x [10 x double]]*, align 4 +// TCHECK3-32-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4 +// TCHECK3-32-NEXT: [[VLA_ADDR4:%.*]] = alloca i32, align 4 +// TCHECK3-32-NEXT: [[CN_ADDR:%.*]] = alloca double*, align 4 +// TCHECK3-32-NEXT: [[D_ADDR:%.*]] = alloca %struct.TT*, align 4 +// TCHECK3-32-NEXT: [[B5:%.*]] = alloca [10 x float], align 4 +// TCHECK3-32-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// TCHECK3-32-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// TCHECK3-32-NEXT: [[C7:%.*]] = alloca [5 x [10 x double]], align 8 +// TCHECK3-32-NEXT: [[__VLA_EXPR1:%.*]] = alloca i32, align 4 +// TCHECK3-32-NEXT: [[__VLA_EXPR2:%.*]] = alloca i32, align 4 +// TCHECK3-32-NEXT: [[D9:%.*]] = alloca [[STRUCT_TT:%.*]], align 4 +// TCHECK3-32-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 +// TCHECK3-32-NEXT: store [10 x float]* [[B]], [10 x float]** [[B_ADDR]], align 4 +// TCHECK3-32-NEXT: store i32 [[VLA]], i32* [[VLA_ADDR]], align 4 +// TCHECK3-32-NEXT: store float* [[BN]], float** [[BN_ADDR]], align 4 +// TCHECK3-32-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[C_ADDR]], align 4 +// TCHECK3-32-NEXT: store i32 [[VLA1]], i32* [[VLA_ADDR2]], align 4 +// TCHECK3-32-NEXT: store i32 [[VLA3]], i32* [[VLA_ADDR4]], align 4 +// TCHECK3-32-NEXT: store double* [[CN]], double** [[CN_ADDR]], align 4 +// TCHECK3-32-NEXT: store %struct.TT* [[D]], %struct.TT** [[D_ADDR]], align 4 +// TCHECK3-32-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* +// TCHECK3-32-NEXT: [[TMP0:%.*]] = load [10 x float]*, [10 x float]** [[B_ADDR]], align 4 +// TCHECK3-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4 +// TCHECK3-32-NEXT: [[TMP2:%.*]] = load float*, float** [[BN_ADDR]], align 4 +// TCHECK3-32-NEXT: [[TMP3:%.*]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]], align 4 +// TCHECK3-32-NEXT: [[TMP4:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4 +// TCHECK3-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[VLA_ADDR4]], align 4 +// TCHECK3-32-NEXT: [[TMP6:%.*]] = load double*, double** [[CN_ADDR]], align 4 +// TCHECK3-32-NEXT: [[TMP7:%.*]] = load %struct.TT*, %struct.TT** [[D_ADDR]], align 4 +// TCHECK3-32-NEXT: [[TMP8:%.*]] = bitcast [10 x float]* [[B5]] to i8* +// TCHECK3-32-NEXT: [[TMP9:%.*]] = bitcast [10 x float]* [[TMP0]] to i8* +// TCHECK3-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP8]], i8* align 
4 [[TMP9]], i32 40, i1 false) +// TCHECK3-32-NEXT: [[TMP10:%.*]] = call i8* @llvm.stacksave() +// TCHECK3-32-NEXT: store i8* [[TMP10]], i8** [[SAVED_STACK]], align 4 +// TCHECK3-32-NEXT: [[VLA6:%.*]] = alloca float, i32 [[TMP1]], align 4 +// TCHECK3-32-NEXT: store i32 [[TMP1]], i32* [[__VLA_EXPR0]], align 4 +// TCHECK3-32-NEXT: [[TMP11:%.*]] = mul nuw i32 [[TMP1]], 4 +// TCHECK3-32-NEXT: [[TMP12:%.*]] = bitcast float* [[VLA6]] to i8* +// TCHECK3-32-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP2]] to i8* +// TCHECK3-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP12]], i8* align 4 [[TMP13]], i32 [[TMP11]], i1 false) +// TCHECK3-32-NEXT: [[TMP14:%.*]] = bitcast [5 x [10 x double]]* [[C7]] to i8* +// TCHECK3-32-NEXT: [[TMP15:%.*]] = bitcast [5 x [10 x double]]* [[TMP3]] to i8* +// TCHECK3-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i32 400, i1 false) +// TCHECK3-32-NEXT: [[TMP16:%.*]] = mul nuw i32 [[TMP4]], [[TMP5]] +// TCHECK3-32-NEXT: [[VLA8:%.*]] = alloca double, i32 [[TMP16]], align 8 +// TCHECK3-32-NEXT: store i32 [[TMP4]], i32* [[__VLA_EXPR1]], align 4 +// TCHECK3-32-NEXT: store i32 [[TMP5]], i32* [[__VLA_EXPR2]], align 4 +// TCHECK3-32-NEXT: [[TMP17:%.*]] = mul nuw i32 [[TMP4]], [[TMP5]] +// TCHECK3-32-NEXT: [[TMP18:%.*]] = mul nuw i32 [[TMP17]], 8 +// TCHECK3-32-NEXT: [[TMP19:%.*]] = bitcast double* [[VLA8]] to i8* +// TCHECK3-32-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP6]] to i8* +// TCHECK3-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP19]], i8* align 8 [[TMP20]], i32 [[TMP18]], i1 false) +// TCHECK3-32-NEXT: [[TMP21:%.*]] = bitcast %struct.TT* [[D9]] to i8* +// TCHECK3-32-NEXT: [[TMP22:%.*]] = bitcast %struct.TT* [[TMP7]] to i8* +// TCHECK3-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP21]], i8* align 4 [[TMP22]], i32 12, i1 false) +// TCHECK3-32-NEXT: [[TMP23:%.*]] = load i16, i16* [[CONV]], align 2 +// TCHECK3-32-NEXT: [[CONV10:%.*]] = sext i16 [[TMP23]] to i32 +// TCHECK3-32-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV10]], 1 +// TCHECK3-32-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD]] to i16 +// TCHECK3-32-NEXT: store i16 [[CONV11]], i16* [[CONV]], align 2 +// TCHECK3-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[B5]], i32 0, i32 2 +// TCHECK3-32-NEXT: store float 1.000000e+00, float* [[ARRAYIDX]], align 4 +// TCHECK3-32-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[VLA6]], i32 3 +// TCHECK3-32-NEXT: store float 1.000000e+00, float* [[ARRAYIDX12]], align 4 +// TCHECK3-32-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[C7]], i32 0, i32 1 +// TCHECK3-32-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX13]], i32 0, i32 2 +// TCHECK3-32-NEXT: store double 1.000000e+00, double* [[ARRAYIDX14]], align 8 +// TCHECK3-32-NEXT: [[TMP24:%.*]] = mul nsw i32 1, [[TMP5]] +// TCHECK3-32-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds double, double* [[VLA8]], i32 [[TMP24]] +// TCHECK3-32-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX15]], i32 3 +// TCHECK3-32-NEXT: store double 1.000000e+00, double* [[ARRAYIDX16]], align 8 +// TCHECK3-32-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 0 +// TCHECK3-32-NEXT: store i64 1, i64* [[X]], align 4 +// TCHECK3-32-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 1 +// TCHECK3-32-NEXT: store i8 1, i8* [[Y]], align 4 +// 
TCHECK3-32-NEXT: [[TMP25:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// TCHECK3-32-NEXT: call void @llvm.stackrestore(i8* [[TMP25]]) +// TCHECK3-32-NEXT: ret void +// TCHECK3-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111 +// TCHECK3-32-SAME: (double* noundef [[PTR:%.*]], %struct.TT.0* noundef nonnull align 4 dereferenceable(8) [[E:%.*]]) #[[ATTR0]] { +// TCHECK3-32-NEXT: entry: +// TCHECK3-32-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 4 +// TCHECK3-32-NEXT: [[E_ADDR:%.*]] = alloca %struct.TT.0*, align 4 +// TCHECK3-32-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 4 +// TCHECK3-32-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[E_ADDR]], align 4 +// TCHECK3-32-NEXT: [[TMP0:%.*]] = load %struct.TT.0*, %struct.TT.0** [[E_ADDR]], align 4 +// TCHECK3-32-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0:%.*]], %struct.TT.0* [[TMP0]], i32 0, i32 0 +// TCHECK3-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[X]], align 4 +// TCHECK3-32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP1]] to double +// TCHECK3-32-NEXT: [[TMP2:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// TCHECK3-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0 +// TCHECK3-32-NEXT: store double [[CONV]], double* [[ARRAYIDX]], align 4 +// TCHECK3-32-NEXT: [[TMP3:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// TCHECK3-32-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0 +// TCHECK3-32-NEXT: [[TMP4:%.*]] = load double, double* [[ARRAYIDX1]], align 4 +// TCHECK3-32-NEXT: [[INC:%.*]] = fadd double [[TMP4]], 1.000000e+00 +// TCHECK3-32-NEXT: store double [[INC]], double* [[ARRAYIDX1]], align 4 +// TCHECK3-32-NEXT: ret void +// TCHECK3-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142 +// TCHECK3-32-SAME: (i32 noundef [[A:%.*]], i32 noundef [[AAA:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// TCHECK3-32-NEXT: entry: +// TCHECK3-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// TCHECK3-32-NEXT: [[AAA_ADDR:%.*]] = alloca i32, align 4 +// TCHECK3-32-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// TCHECK3-32-NEXT: [[B1:%.*]] = alloca [10 x i32], align 4 +// TCHECK3-32-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// TCHECK3-32-NEXT: store i32 [[AAA]], i32* [[AAA_ADDR]], align 4 +// TCHECK3-32-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// TCHECK3-32-NEXT: [[CONV:%.*]] = bitcast i32* [[AAA_ADDR]] to i8* +// TCHECK3-32-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// TCHECK3-32-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B1]] to i8* +// TCHECK3-32-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// TCHECK3-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i32 40, i1 false) +// TCHECK3-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// TCHECK3-32-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// TCHECK3-32-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// TCHECK3-32-NEXT: [[TMP4:%.*]] = load i8, i8* [[CONV]], align 1 +// TCHECK3-32-NEXT: [[CONV2:%.*]] = sext i8 [[TMP4]] to i32 +// TCHECK3-32-NEXT: [[ADD3:%.*]] = add nsw i32 [[CONV2]], 1 +// TCHECK3-32-NEXT: [[CONV4:%.*]] = trunc i32 [[ADD3]] to i8 +// TCHECK3-32-NEXT: store i8 [[CONV4]], i8* [[CONV]], align 1 +// TCHECK3-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B1]], i32 0, i32 2 +// TCHECK3-32-NEXT: [[TMP5:%.*]] = load i32, i32* 
[[ARRAYIDX]], align 4 +// TCHECK3-32-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP5]], 1 +// TCHECK3-32-NEXT: store i32 [[ADD5]], i32* [[ARRAYIDX]], align 4 +// TCHECK3-32-NEXT: ret void +// TCHECK3-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167 +// TCHECK3-32-SAME: (%struct.S1* noundef [[THIS:%.*]], i32 noundef [[B:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR0]] { +// TCHECK3-32-NEXT: entry: +// TCHECK3-32-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 4 +// TCHECK3-32-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// TCHECK3-32-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4 +// TCHECK3-32-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4 +// TCHECK3-32-NEXT: [[C_ADDR:%.*]] = alloca i16*, align 4 +// TCHECK3-32-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// TCHECK3-32-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// TCHECK3-32-NEXT: [[__VLA_EXPR1:%.*]] = alloca i32, align 4 +// TCHECK3-32-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 4 +// TCHECK3-32-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 +// TCHECK3-32-NEXT: store i32 [[VLA]], i32* [[VLA_ADDR]], align 4 +// TCHECK3-32-NEXT: store i32 [[VLA1]], i32* [[VLA_ADDR2]], align 4 +// TCHECK3-32-NEXT: store i16* [[C]], i16** [[C_ADDR]], align 4 +// TCHECK3-32-NEXT: [[TMP0:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 4 +// TCHECK3-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4 +// TCHECK3-32-NEXT: [[TMP2:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4 +// TCHECK3-32-NEXT: [[TMP3:%.*]] = load i16*, i16** [[C_ADDR]], align 4 +// TCHECK3-32-NEXT: [[TMP4:%.*]] = call i8* @llvm.stacksave() +// TCHECK3-32-NEXT: store i8* [[TMP4]], i8** [[SAVED_STACK]], align 4 +// TCHECK3-32-NEXT: [[TMP5:%.*]] = mul nuw i32 [[TMP1]], [[TMP2]] +// TCHECK3-32-NEXT: [[VLA3:%.*]] = alloca i16, i32 [[TMP5]], align 2 +// TCHECK3-32-NEXT: store i32 [[TMP1]], i32* [[__VLA_EXPR0]], align 4 +// TCHECK3-32-NEXT: store i32 [[TMP2]], i32* [[__VLA_EXPR1]], align 4 +// TCHECK3-32-NEXT: [[TMP6:%.*]] = mul nuw i32 [[TMP1]], [[TMP2]] +// TCHECK3-32-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 2 +// TCHECK3-32-NEXT: [[TMP8:%.*]] = bitcast i16* [[VLA3]] to i8* +// TCHECK3-32-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP3]] to i8* +// TCHECK3-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[TMP8]], i8* align 2 [[TMP9]], i32 [[TMP7]], i1 false) +// TCHECK3-32-NEXT: [[TMP10:%.*]] = load i32, i32* [[B_ADDR]], align 4 +// TCHECK3-32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP10]] to double +// TCHECK3-32-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00 +// TCHECK3-32-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[TMP0]], i32 0, i32 0 +// TCHECK3-32-NEXT: store double [[ADD]], double* [[A]], align 4 +// TCHECK3-32-NEXT: [[A4:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0 +// TCHECK3-32-NEXT: [[TMP11:%.*]] = load double, double* [[A4]], align 4 +// TCHECK3-32-NEXT: [[INC:%.*]] = fadd double [[TMP11]], 1.000000e+00 +// TCHECK3-32-NEXT: store double [[INC]], double* [[A4]], align 4 +// TCHECK3-32-NEXT: [[CONV5:%.*]] = fptosi double [[INC]] to i16 +// TCHECK3-32-NEXT: [[TMP12:%.*]] = mul nsw i32 1, [[TMP2]] +// TCHECK3-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA3]], i32 [[TMP12]] +// TCHECK3-32-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i32 1 +// TCHECK3-32-NEXT: store i16 [[CONV5]], i16* [[ARRAYIDX6]], align 2 +// 
TCHECK3-32-NEXT: [[TMP13:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// TCHECK3-32-NEXT: call void @llvm.stackrestore(i8* [[TMP13]]) +// TCHECK3-32-NEXT: ret void +// TCHECK3-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128 +// TCHECK3-32-SAME: (i32 noundef [[A:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// TCHECK3-32-NEXT: entry: +// TCHECK3-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// TCHECK3-32-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// TCHECK3-32-NEXT: [[B1:%.*]] = alloca [10 x i32], align 4 +// TCHECK3-32-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// TCHECK3-32-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// TCHECK3-32-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// TCHECK3-32-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B1]] to i8* +// TCHECK3-32-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// TCHECK3-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i32 40, i1 false) +// TCHECK3-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// TCHECK3-32-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// TCHECK3-32-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// TCHECK3-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B1]], i32 0, i32 2 +// TCHECK3-32-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// TCHECK3-32-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], 1 +// TCHECK3-32-NEXT: store i32 [[ADD2]], i32* [[ARRAYIDX]], align 4 +// TCHECK3-32-NEXT: ret void +// CHECK0-LABEL: define {{[^@]+}}@_Z3fooiPd +// CHECK0-SAME: (i32 noundef signext [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK0-NEXT: entry: +// CHECK0-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK0-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 8 +// CHECK0-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK0-NEXT: [[AA:%.*]] = alloca i16, align 2 +// CHECK0-NEXT: [[B:%.*]] = alloca [10 x float], align 4 +// CHECK0-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// CHECK0-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// CHECK0-NEXT: [[C:%.*]] = alloca [5 x [10 x double]], align 8 +// CHECK0-NEXT: [[__VLA_EXPR1:%.*]] = alloca i64, align 8 +// CHECK0-NEXT: [[D:%.*]] = alloca [[STRUCT_TT:%.*]], align 8 +// CHECK0-NEXT: [[E:%.*]] = alloca [[STRUCT_TT_0:%.*]], align 4 +// CHECK0-NEXT: [[P:%.*]] = alloca i32*, align 64 +// CHECK0-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8 +// CHECK0-NEXT: [[GA_CASTED:%.*]] = alloca i64, align 8 +// CHECK0-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK0-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK0-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x i8*], align 8 +// CHECK0-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK0-NEXT: [[AA_CASTED:%.*]] = alloca i64, align 8 +// CHECK0-NEXT: [[DOTOFFLOAD_BASEPTRS4:%.*]] = alloca [9 x i8*], align 8 +// CHECK0-NEXT: [[DOTOFFLOAD_PTRS5:%.*]] = alloca [9 x i8*], align 8 +// CHECK0-NEXT: [[DOTOFFLOAD_MAPPERS6:%.*]] = alloca [9 x i8*], align 8 +// CHECK0-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [9 x i64], align 8 +// CHECK0-NEXT: [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK0-NEXT: [[DOTOFFLOAD_BASEPTRS10:%.*]] = alloca [2 x i8*], align 8 +// CHECK0-NEXT: [[DOTOFFLOAD_PTRS11:%.*]] = alloca [2 x i8*], align 8 +// CHECK0-NEXT: [[DOTOFFLOAD_MAPPERS12:%.*]] = alloca [2 x 
i8*], align 8 +// CHECK0-NEXT: [[KERNEL_ARGS13:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK0-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK0-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 8 +// CHECK0-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK0-NEXT: store i16 0, i16* [[AA]], align 2 +// CHECK0-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK0-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +// CHECK0-NEXT: [[TMP2:%.*]] = call i8* @llvm.stacksave() +// CHECK0-NEXT: store i8* [[TMP2]], i8** [[SAVED_STACK]], align 8 +// CHECK0-NEXT: [[VLA:%.*]] = alloca float, i64 [[TMP1]], align 4 +// CHECK0-NEXT: store i64 [[TMP1]], i64* [[__VLA_EXPR0]], align 8 +// CHECK0-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK0-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +// CHECK0-NEXT: [[TMP5:%.*]] = mul nuw i64 5, [[TMP4]] +// CHECK0-NEXT: [[VLA1:%.*]] = alloca double, i64 [[TMP5]], align 8 +// CHECK0-NEXT: store i64 [[TMP4]], i64* [[__VLA_EXPR1]], align 8 +// CHECK0-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 0 +// CHECK0-NEXT: [[TMP6:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK0-NEXT: store i32 [[TMP6]], i32* [[X]], align 4 +// CHECK0-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 1 +// CHECK0-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK0-NEXT: store i32 [[TMP7]], i32* [[Y]], align 4 +// CHECK0-NEXT: store i32* [[A]], i32** [[P]], align 64 +// CHECK0-NEXT: [[TMP8:%.*]] = load i32, i32* [[A]], align 4 +// CHECK0-NEXT: [[CONV:%.*]] = bitcast i64* [[A_CASTED]] to i32* +// CHECK0-NEXT: store i32 [[TMP8]], i32* [[CONV]], align 4 +// CHECK0-NEXT: [[TMP9:%.*]] = load i64, i64* [[A_CASTED]], align 8 +// CHECK0-NEXT: [[TMP10:%.*]] = load i32*, i32** [[P]], align 64 +// CHECK0-NEXT: [[TMP11:%.*]] = load i32, i32* @ga, align 4 +// CHECK0-NEXT: [[CONV2:%.*]] = bitcast i64* [[GA_CASTED]] to i32* +// CHECK0-NEXT: store i32 [[TMP11]], i32* [[CONV2]], align 4 +// CHECK0-NEXT: [[TMP12:%.*]] = load i64, i64* [[GA_CASTED]], align 8 +// CHECK0-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK0-NEXT: [[TMP14:%.*]] = bitcast i8** [[TMP13]] to i64* +// CHECK0-NEXT: store i64 [[TMP9]], i64* [[TMP14]], align 8 +// CHECK0-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK0-NEXT: [[TMP16:%.*]] = bitcast i8** [[TMP15]] to i64* +// CHECK0-NEXT: store i64 [[TMP9]], i64* [[TMP16]], align 8 +// CHECK0-NEXT: [[TMP17:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK0-NEXT: store i8* null, i8** [[TMP17]], align 8 +// CHECK0-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK0-NEXT: [[TMP19:%.*]] = bitcast i8** [[TMP18]] to i32** +// CHECK0-NEXT: store i32* [[TMP10]], i32** [[TMP19]], align 8 +// CHECK0-NEXT: [[TMP20:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK0-NEXT: [[TMP21:%.*]] = bitcast i8** [[TMP20]] to i32** +// CHECK0-NEXT: store i32* [[TMP10]], i32** [[TMP21]], align 8 +// CHECK0-NEXT: [[TMP22:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CHECK0-NEXT: store i8* null, i8** [[TMP22]], align 8 +// CHECK0-NEXT: [[TMP23:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 
2 +// CHECK0-NEXT: [[TMP24:%.*]] = bitcast i8** [[TMP23]] to i64* +// CHECK0-NEXT: store i64 [[TMP12]], i64* [[TMP24]], align 8 +// CHECK0-NEXT: [[TMP25:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK0-NEXT: [[TMP26:%.*]] = bitcast i8** [[TMP25]] to i64* +// CHECK0-NEXT: store i64 [[TMP12]], i64* [[TMP26]], align 8 +// CHECK0-NEXT: [[TMP27:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2 +// CHECK0-NEXT: store i8* null, i8** [[TMP27]], align 8 +// CHECK0-NEXT: [[TMP28:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK0-NEXT: [[TMP29:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK0-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK0-NEXT: store i32 2, i32* [[TMP30]], align 4 +// CHECK0-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK0-NEXT: store i32 3, i32* [[TMP31]], align 4 +// CHECK0-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK0-NEXT: store i8** [[TMP28]], i8*** [[TMP32]], align 8 +// CHECK0-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK0-NEXT: store i8** [[TMP29]], i8*** [[TMP33]], align 8 +// CHECK0-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK0-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_sizes, i32 0, i32 0), i64** [[TMP34]], align 8 +// CHECK0-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK0-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_maptypes, i32 0, i32 0), i64** [[TMP35]], align 8 +// CHECK0-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK0-NEXT: store i8** null, i8*** [[TMP36]], align 8 +// CHECK0-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK0-NEXT: store i8** null, i8*** [[TMP37]], align 8 +// CHECK0-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK0-NEXT: store i64 0, i64* [[TMP38]], align 8 +// CHECK0-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK0-NEXT: store i64 0, i64* [[TMP39]], align 8 +// CHECK0-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK0-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP40]], align 4 +// CHECK0-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK0-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP41]], align 4 +// CHECK0-NEXT: [[TMP42:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK0-NEXT: store i32 0, i32* [[TMP42]], align 4 +// CHECK0-NEXT: [[TMP43:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK0-NEXT: [[TMP44:%.*]] = icmp ne i32 [[TMP43]], 0 +// CHECK0-NEXT: br i1 [[TMP44]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK0: omp_offload.failed: +// CHECK0-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63(i64 [[TMP9]], i32* [[TMP10]], i64 [[TMP12]]) #[[ATTR3:[0-9]+]] +// CHECK0-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK0: omp_offload.cont: +// CHECK0-NEXT: [[TMP45:%.*]] = load i16, i16* [[AA]], align 2 +// CHECK0-NEXT: [[CONV3:%.*]] = bitcast i64* [[AA_CASTED]] to i16* +// CHECK0-NEXT: store i16 [[TMP45]], i16* [[CONV3]], align 2 +// CHECK0-NEXT: [[TMP46:%.*]] = load i64, i64* [[AA_CASTED]], align 8 +// CHECK0-NEXT: [[TMP47:%.*]] = mul nuw i64 [[TMP1]], 4 +// CHECK0-NEXT: [[TMP48:%.*]] = mul nuw i64 5, [[TMP4]] +// CHECK0-NEXT: [[TMP49:%.*]] = mul nuw i64 [[TMP48]], 8 +// CHECK0-NEXT: [[TMP50:%.*]] = bitcast [9 x i64]* [[DOTOFFLOAD_SIZES]] to i8* +// CHECK0-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP50]], i8* align 8 bitcast ([9 x i64]* @.offload_sizes.1 to i8*), i64 72, i1 false) +// CHECK0-NEXT: [[TMP51:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 0 +// CHECK0-NEXT: [[TMP52:%.*]] = bitcast i8** [[TMP51]] to i64* +// CHECK0-NEXT: store i64 [[TMP46]], i64* [[TMP52]], align 8 +// CHECK0-NEXT: [[TMP53:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 0 +// CHECK0-NEXT: [[TMP54:%.*]] = bitcast i8** [[TMP53]] to i64* +// CHECK0-NEXT: store i64 [[TMP46]], i64* [[TMP54]], align 8 +// CHECK0-NEXT: [[TMP55:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 0 +// CHECK0-NEXT: store i8* null, i8** [[TMP55]], align 8 +// CHECK0-NEXT: [[TMP56:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 1 +// CHECK0-NEXT: [[TMP57:%.*]] = bitcast i8** [[TMP56]] to [10 x float]** +// CHECK0-NEXT: store [10 x float]* [[B]], [10 x float]** [[TMP57]], align 8 +// CHECK0-NEXT: [[TMP58:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 1 +// CHECK0-NEXT: [[TMP59:%.*]] = bitcast i8** [[TMP58]] to [10 x float]** +// CHECK0-NEXT: store [10 x float]* [[B]], [10 x float]** [[TMP59]], align 8 +// CHECK0-NEXT: [[TMP60:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 1 +// CHECK0-NEXT: store i8* null, i8** [[TMP60]], align 8 +// CHECK0-NEXT: [[TMP61:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 2 +// CHECK0-NEXT: [[TMP62:%.*]] = bitcast i8** [[TMP61]] to i64* +// CHECK0-NEXT: store i64 [[TMP1]], i64* [[TMP62]], align 8 +// CHECK0-NEXT: [[TMP63:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 2 +// CHECK0-NEXT: [[TMP64:%.*]] = bitcast i8** [[TMP63]] to i64* +// CHECK0-NEXT: store i64 [[TMP1]], i64* [[TMP64]], align 8 +// CHECK0-NEXT: [[TMP65:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 2 +// CHECK0-NEXT: store i8* null, i8** [[TMP65]], align 8 +// CHECK0-NEXT: [[TMP66:%.*]] = getelementptr inbounds [9 x 
i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 3 +// CHECK0-NEXT: [[TMP67:%.*]] = bitcast i8** [[TMP66]] to float** +// CHECK0-NEXT: store float* [[VLA]], float** [[TMP67]], align 8 +// CHECK0-NEXT: [[TMP68:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 3 +// CHECK0-NEXT: [[TMP69:%.*]] = bitcast i8** [[TMP68]] to float** +// CHECK0-NEXT: store float* [[VLA]], float** [[TMP69]], align 8 +// CHECK0-NEXT: [[TMP70:%.*]] = getelementptr inbounds [9 x i64], [9 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 3 +// CHECK0-NEXT: store i64 [[TMP47]], i64* [[TMP70]], align 8 +// CHECK0-NEXT: [[TMP71:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 3 +// CHECK0-NEXT: store i8* null, i8** [[TMP71]], align 8 +// CHECK0-NEXT: [[TMP72:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 4 +// CHECK0-NEXT: [[TMP73:%.*]] = bitcast i8** [[TMP72]] to [5 x [10 x double]]** +// CHECK0-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[TMP73]], align 8 +// CHECK0-NEXT: [[TMP74:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 4 +// CHECK0-NEXT: [[TMP75:%.*]] = bitcast i8** [[TMP74]] to [5 x [10 x double]]** +// CHECK0-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[TMP75]], align 8 +// CHECK0-NEXT: [[TMP76:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 4 +// CHECK0-NEXT: store i8* null, i8** [[TMP76]], align 8 +// CHECK0-NEXT: [[TMP77:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 5 +// CHECK0-NEXT: [[TMP78:%.*]] = bitcast i8** [[TMP77]] to i64* +// CHECK0-NEXT: store i64 5, i64* [[TMP78]], align 8 +// CHECK0-NEXT: [[TMP79:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 5 +// CHECK0-NEXT: [[TMP80:%.*]] = bitcast i8** [[TMP79]] to i64* +// CHECK0-NEXT: store i64 5, i64* [[TMP80]], align 8 +// CHECK0-NEXT: [[TMP81:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 5 +// CHECK0-NEXT: store i8* null, i8** [[TMP81]], align 8 +// CHECK0-NEXT: [[TMP82:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 6 +// CHECK0-NEXT: [[TMP83:%.*]] = bitcast i8** [[TMP82]] to i64* +// CHECK0-NEXT: store i64 [[TMP4]], i64* [[TMP83]], align 8 +// CHECK0-NEXT: [[TMP84:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 6 +// CHECK0-NEXT: [[TMP85:%.*]] = bitcast i8** [[TMP84]] to i64* +// CHECK0-NEXT: store i64 [[TMP4]], i64* [[TMP85]], align 8 +// CHECK0-NEXT: [[TMP86:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 6 +// CHECK0-NEXT: store i8* null, i8** [[TMP86]], align 8 +// CHECK0-NEXT: [[TMP87:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 7 +// CHECK0-NEXT: [[TMP88:%.*]] = bitcast i8** [[TMP87]] to double** +// CHECK0-NEXT: store double* [[VLA1]], double** [[TMP88]], align 8 +// CHECK0-NEXT: [[TMP89:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 7 +// CHECK0-NEXT: [[TMP90:%.*]] = bitcast i8** [[TMP89]] to double** +// CHECK0-NEXT: store double* [[VLA1]], double** [[TMP90]], align 8 +// CHECK0-NEXT: [[TMP91:%.*]] = getelementptr inbounds [9 x i64], [9 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 7 +// CHECK0-NEXT: store i64 [[TMP49]], i64* [[TMP91]], align 8 +// CHECK0-NEXT: [[TMP92:%.*]] = getelementptr 
inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 7 +// CHECK0-NEXT: store i8* null, i8** [[TMP92]], align 8 +// CHECK0-NEXT: [[TMP93:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 8 +// CHECK0-NEXT: [[TMP94:%.*]] = bitcast i8** [[TMP93]] to %struct.TT** +// CHECK0-NEXT: store %struct.TT* [[D]], %struct.TT** [[TMP94]], align 8 +// CHECK0-NEXT: [[TMP95:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 8 +// CHECK0-NEXT: [[TMP96:%.*]] = bitcast i8** [[TMP95]] to %struct.TT** +// CHECK0-NEXT: store %struct.TT* [[D]], %struct.TT** [[TMP96]], align 8 +// CHECK0-NEXT: [[TMP97:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 8 +// CHECK0-NEXT: store i8* null, i8** [[TMP97]], align 8 +// CHECK0-NEXT: [[TMP98:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 0 +// CHECK0-NEXT: [[TMP99:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 0 +// CHECK0-NEXT: [[TMP100:%.*]] = getelementptr inbounds [9 x i64], [9 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +// CHECK0-NEXT: [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 0 +// CHECK0-NEXT: store i32 2, i32* [[TMP101]], align 4 +// CHECK0-NEXT: [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 1 +// CHECK0-NEXT: store i32 9, i32* [[TMP102]], align 4 +// CHECK0-NEXT: [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 2 +// CHECK0-NEXT: store i8** [[TMP98]], i8*** [[TMP103]], align 8 +// CHECK0-NEXT: [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 3 +// CHECK0-NEXT: store i8** [[TMP99]], i8*** [[TMP104]], align 8 +// CHECK0-NEXT: [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 4 +// CHECK0-NEXT: store i64* [[TMP100]], i64** [[TMP105]], align 8 +// CHECK0-NEXT: [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 5 +// CHECK0-NEXT: store i64* getelementptr inbounds ([9 x i64], [9 x i64]* @.offload_maptypes.2, i32 0, i32 0), i64** [[TMP106]], align 8 +// CHECK0-NEXT: [[TMP107:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 6 +// CHECK0-NEXT: store i8** null, i8*** [[TMP107]], align 8 +// CHECK0-NEXT: [[TMP108:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 7 +// CHECK0-NEXT: store i8** null, i8*** [[TMP108]], align 8 +// CHECK0-NEXT: [[TMP109:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 8 +// CHECK0-NEXT: store i64 0, i64* [[TMP109]], align 8 +// CHECK0-NEXT: [[TMP110:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 9 +// CHECK0-NEXT: store i64 0, i64* [[TMP110]], align 8 +// CHECK0-NEXT: [[TMP111:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 10 +// 
CHECK0-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP111]], align 4 +// CHECK0-NEXT: [[TMP112:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 11 +// CHECK0-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP112]], align 4 +// CHECK0-NEXT: [[TMP113:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 12 +// CHECK0-NEXT: store i32 0, i32* [[TMP113]], align 4 +// CHECK0-NEXT: [[TMP114:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]]) +// CHECK0-NEXT: [[TMP115:%.*]] = icmp ne i32 [[TMP114]], 0 +// CHECK0-NEXT: br i1 [[TMP115]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] +// CHECK0: omp_offload.failed8: +// CHECK0-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70(i64 [[TMP46]], [10 x float]* [[B]], i64 [[TMP1]], float* [[VLA]], [5 x [10 x double]]* [[C]], i64 5, i64 [[TMP4]], double* [[VLA1]], %struct.TT* [[D]]) #[[ATTR3]] +// CHECK0-NEXT: br label [[OMP_OFFLOAD_CONT9]] +// CHECK0: omp_offload.cont9: +// CHECK0-NEXT: [[TMP116:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// CHECK0-NEXT: [[TMP117:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0 +// CHECK0-NEXT: [[TMP118:%.*]] = bitcast i8** [[TMP117]] to double** +// CHECK0-NEXT: store double* [[TMP116]], double** [[TMP118]], align 8 +// CHECK0-NEXT: [[TMP119:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS11]], i32 0, i32 0 +// CHECK0-NEXT: [[TMP120:%.*]] = bitcast i8** [[TMP119]] to double** +// CHECK0-NEXT: store double* [[TMP116]], double** [[TMP120]], align 8 +// CHECK0-NEXT: [[TMP121:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS12]], i64 0, i64 0 +// CHECK0-NEXT: store i8* null, i8** [[TMP121]], align 8 +// CHECK0-NEXT: [[TMP122:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 1 +// CHECK0-NEXT: [[TMP123:%.*]] = bitcast i8** [[TMP122]] to %struct.TT.0** +// CHECK0-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[TMP123]], align 8 +// CHECK0-NEXT: [[TMP124:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS11]], i32 0, i32 1 +// CHECK0-NEXT: [[TMP125:%.*]] = bitcast i8** [[TMP124]] to %struct.TT.0** +// CHECK0-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[TMP125]], align 8 +// CHECK0-NEXT: [[TMP126:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS12]], i64 0, i64 1 +// CHECK0-NEXT: store i8* null, i8** [[TMP126]], align 8 +// CHECK0-NEXT: [[TMP127:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0 +// CHECK0-NEXT: [[TMP128:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS11]], i32 0, i32 0 +// CHECK0-NEXT: [[TMP129:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 0 +// CHECK0-NEXT: store i32 2, i32* [[TMP129]], align 4 +// CHECK0-NEXT: [[TMP130:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 1 +// CHECK0-NEXT: store i32 2, i32* [[TMP130]], align 4 +// CHECK0-NEXT: [[TMP131:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], 
%struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 2 +// CHECK0-NEXT: store i8** [[TMP127]], i8*** [[TMP131]], align 8 +// CHECK0-NEXT: [[TMP132:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 3 +// CHECK0-NEXT: store i8** [[TMP128]], i8*** [[TMP132]], align 8 +// CHECK0-NEXT: [[TMP133:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 4 +// CHECK0-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.3, i32 0, i32 0), i64** [[TMP133]], align 8 +// CHECK0-NEXT: [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 5 +// CHECK0-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i32 0, i32 0), i64** [[TMP134]], align 8 +// CHECK0-NEXT: [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 6 +// CHECK0-NEXT: store i8** null, i8*** [[TMP135]], align 8 +// CHECK0-NEXT: [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 7 +// CHECK0-NEXT: store i8** null, i8*** [[TMP136]], align 8 +// CHECK0-NEXT: [[TMP137:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 8 +// CHECK0-NEXT: store i64 0, i64* [[TMP137]], align 8 +// CHECK0-NEXT: [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 9 +// CHECK0-NEXT: store i64 0, i64* [[TMP138]], align 8 +// CHECK0-NEXT: [[TMP139:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 10 +// CHECK0-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP139]], align 4 +// CHECK0-NEXT: [[TMP140:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 11 +// CHECK0-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP140]], align 4 +// CHECK0-NEXT: [[TMP141:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 12 +// CHECK0-NEXT: store i32 0, i32* [[TMP141]], align 4 +// CHECK0-NEXT: [[TMP142:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]]) +// CHECK0-NEXT: [[TMP143:%.*]] = icmp ne i32 [[TMP142]], 0 +// CHECK0-NEXT: br i1 [[TMP143]], label [[OMP_OFFLOAD_FAILED14:%.*]], label [[OMP_OFFLOAD_CONT15:%.*]] +// CHECK0: omp_offload.failed14: +// CHECK0-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111(double* [[TMP116]], %struct.TT.0* [[E]]) #[[ATTR3]] +// CHECK0-NEXT: br label [[OMP_OFFLOAD_CONT15]] +// CHECK0: omp_offload.cont15: +// CHECK0-NEXT: [[TMP144:%.*]] = load i32, i32* [[A]], align 4 +// CHECK0-NEXT: [[TMP145:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// CHECK0-NEXT: call void @llvm.stackrestore(i8* [[TMP145]]) +// CHECK0-NEXT: ret i32 [[TMP144]] +// +// +// CHECK0-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63 +// CHECK0-SAME: (i64 noundef [[A:%.*]], i32* noundef [[P:%.*]], i64 noundef 
[[GA:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK0-NEXT: entry: +// CHECK0-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK0-NEXT: [[P_ADDR:%.*]] = alloca i32*, align 8 +// CHECK0-NEXT: [[GA_ADDR:%.*]] = alloca i64, align 8 +// CHECK0-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK0-NEXT: store i32* [[P]], i32** [[P_ADDR]], align 8 +// CHECK0-NEXT: store i64 [[GA]], i64* [[GA_ADDR]], align 8 +// CHECK0-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// CHECK0-NEXT: [[CONV1:%.*]] = bitcast i64* [[GA_ADDR]] to i32* +// CHECK0-NEXT: ret void +// +// +// CHECK0-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70 +// CHECK0-SAME: (i64 noundef [[AA:%.*]], [10 x float]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i64 noundef [[VLA:%.*]], float* noundef nonnull align 4 dereferenceable(4) [[BN:%.*]], [5 x [10 x double]]* noundef nonnull align 8 dereferenceable(400) [[C:%.*]], i64 noundef [[VLA1:%.*]], i64 noundef [[VLA3:%.*]], double* noundef nonnull align 8 dereferenceable(8) [[CN:%.*]], %struct.TT* noundef nonnull align 8 dereferenceable(16) [[D:%.*]]) #[[ATTR2]] { +// CHECK0-NEXT: entry: +// CHECK0-NEXT: [[AA_ADDR:%.*]] = alloca i64, align 8 +// CHECK0-NEXT: [[B_ADDR:%.*]] = alloca [10 x float]*, align 8 +// CHECK0-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8 +// CHECK0-NEXT: [[BN_ADDR:%.*]] = alloca float*, align 8 +// CHECK0-NEXT: [[C_ADDR:%.*]] = alloca [5 x [10 x double]]*, align 8 +// CHECK0-NEXT: [[VLA_ADDR2:%.*]] = alloca i64, align 8 +// CHECK0-NEXT: [[VLA_ADDR4:%.*]] = alloca i64, align 8 +// CHECK0-NEXT: [[CN_ADDR:%.*]] = alloca double*, align 8 +// CHECK0-NEXT: [[D_ADDR:%.*]] = alloca %struct.TT*, align 8 +// CHECK0-NEXT: [[B5:%.*]] = alloca [10 x float], align 4 +// CHECK0-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// CHECK0-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// CHECK0-NEXT: [[C7:%.*]] = alloca [5 x [10 x double]], align 8 +// CHECK0-NEXT: [[__VLA_EXPR1:%.*]] = alloca i64, align 8 +// CHECK0-NEXT: [[__VLA_EXPR2:%.*]] = alloca i64, align 8 +// CHECK0-NEXT: [[D9:%.*]] = alloca [[STRUCT_TT:%.*]], align 8 +// CHECK0-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8 +// CHECK0-NEXT: store [10 x float]* [[B]], [10 x float]** [[B_ADDR]], align 8 +// CHECK0-NEXT: store i64 [[VLA]], i64* [[VLA_ADDR]], align 8 +// CHECK0-NEXT: store float* [[BN]], float** [[BN_ADDR]], align 8 +// CHECK0-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[C_ADDR]], align 8 +// CHECK0-NEXT: store i64 [[VLA1]], i64* [[VLA_ADDR2]], align 8 +// CHECK0-NEXT: store i64 [[VLA3]], i64* [[VLA_ADDR4]], align 8 +// CHECK0-NEXT: store double* [[CN]], double** [[CN_ADDR]], align 8 +// CHECK0-NEXT: store %struct.TT* [[D]], %struct.TT** [[D_ADDR]], align 8 +// CHECK0-NEXT: [[CONV:%.*]] = bitcast i64* [[AA_ADDR]] to i16* +// CHECK0-NEXT: [[TMP0:%.*]] = load [10 x float]*, [10 x float]** [[B_ADDR]], align 8 +// CHECK0-NEXT: [[TMP1:%.*]] = load i64, i64* [[VLA_ADDR]], align 8 +// CHECK0-NEXT: [[TMP2:%.*]] = load float*, float** [[BN_ADDR]], align 8 +// CHECK0-NEXT: [[TMP3:%.*]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]], align 8 +// CHECK0-NEXT: [[TMP4:%.*]] = load i64, i64* [[VLA_ADDR2]], align 8 +// CHECK0-NEXT: [[TMP5:%.*]] = load i64, i64* [[VLA_ADDR4]], align 8 +// CHECK0-NEXT: [[TMP6:%.*]] = load double*, double** [[CN_ADDR]], align 8 +// CHECK0-NEXT: [[TMP7:%.*]] = load %struct.TT*, %struct.TT** [[D_ADDR]], align 8 +// CHECK0-NEXT: [[TMP8:%.*]] = bitcast [10 x float]* [[B5]] to i8* +// CHECK0-NEXT: [[TMP9:%.*]] = bitcast 
[10 x float]* [[TMP0]] to i8* +// CHECK0-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i64 40, i1 false) +// CHECK0-NEXT: [[TMP10:%.*]] = call i8* @llvm.stacksave() +// CHECK0-NEXT: store i8* [[TMP10]], i8** [[SAVED_STACK]], align 8 +// CHECK0-NEXT: [[VLA6:%.*]] = alloca float, i64 [[TMP1]], align 4 +// CHECK0-NEXT: store i64 [[TMP1]], i64* [[__VLA_EXPR0]], align 8 +// CHECK0-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP1]], 4 +// CHECK0-NEXT: [[TMP12:%.*]] = bitcast float* [[VLA6]] to i8* +// CHECK0-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP2]] to i8* +// CHECK0-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP12]], i8* align 4 [[TMP13]], i64 [[TMP11]], i1 false) +// CHECK0-NEXT: [[TMP14:%.*]] = bitcast [5 x [10 x double]]* [[C7]] to i8* +// CHECK0-NEXT: [[TMP15:%.*]] = bitcast [5 x [10 x double]]* [[TMP3]] to i8* +// CHECK0-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 400, i1 false) +// CHECK0-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP4]], [[TMP5]] +// CHECK0-NEXT: [[VLA8:%.*]] = alloca double, i64 [[TMP16]], align 8 +// CHECK0-NEXT: store i64 [[TMP4]], i64* [[__VLA_EXPR1]], align 8 +// CHECK0-NEXT: store i64 [[TMP5]], i64* [[__VLA_EXPR2]], align 8 +// CHECK0-NEXT: [[TMP17:%.*]] = mul nuw i64 [[TMP4]], [[TMP5]] +// CHECK0-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 8 +// CHECK0-NEXT: [[TMP19:%.*]] = bitcast double* [[VLA8]] to i8* +// CHECK0-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP6]] to i8* +// CHECK0-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP19]], i8* align 8 [[TMP20]], i64 [[TMP18]], i1 false) +// CHECK0-NEXT: [[TMP21:%.*]] = bitcast %struct.TT* [[D9]] to i8* +// CHECK0-NEXT: [[TMP22:%.*]] = bitcast %struct.TT* [[TMP7]] to i8* +// CHECK0-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP21]], i8* align 8 [[TMP22]], i64 16, i1 false) +// CHECK0-NEXT: [[TMP23:%.*]] = load i16, i16* [[CONV]], align 2 +// CHECK0-NEXT: [[CONV10:%.*]] = sext i16 [[TMP23]] to i32 +// CHECK0-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV10]], 1 +// CHECK0-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD]] to i16 +// CHECK0-NEXT: store i16 [[CONV11]], i16* [[CONV]], align 2 +// CHECK0-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[B5]], i64 0, i64 2 +// CHECK0-NEXT: store float 1.000000e+00, float* [[ARRAYIDX]], align 4 +// CHECK0-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[VLA6]], i64 3 +// CHECK0-NEXT: store float 1.000000e+00, float* [[ARRAYIDX12]], align 4 +// CHECK0-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[C7]], i64 0, i64 1 +// CHECK0-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX13]], i64 0, i64 2 +// CHECK0-NEXT: store double 1.000000e+00, double* [[ARRAYIDX14]], align 8 +// CHECK0-NEXT: [[TMP24:%.*]] = mul nsw i64 1, [[TMP5]] +// CHECK0-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds double, double* [[VLA8]], i64 [[TMP24]] +// CHECK0-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX15]], i64 3 +// CHECK0-NEXT: store double 1.000000e+00, double* [[ARRAYIDX16]], align 8 +// CHECK0-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 0 +// CHECK0-NEXT: store i64 1, i64* [[X]], align 8 +// CHECK0-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 1 +// CHECK0-NEXT: store i8 1, i8* [[Y]], align 8 +// CHECK0-NEXT: [[TMP25:%.*]] = load i8*, i8** 
[[SAVED_STACK]], align 8 +// CHECK0-NEXT: call void @llvm.stackrestore(i8* [[TMP25]]) +// CHECK0-NEXT: ret void +// +// +// CHECK0-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111 +// CHECK0-SAME: (double* noundef [[PTR:%.*]], %struct.TT.0* noundef nonnull align 4 dereferenceable(8) [[E:%.*]]) #[[ATTR2]] { +// CHECK0-NEXT: entry: +// CHECK0-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 8 +// CHECK0-NEXT: [[E_ADDR:%.*]] = alloca %struct.TT.0*, align 8 +// CHECK0-NEXT: [[E1:%.*]] = alloca [[STRUCT_TT_0:%.*]], align 4 +// CHECK0-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 8 +// CHECK0-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[E_ADDR]], align 8 +// CHECK0-NEXT: [[TMP0:%.*]] = load %struct.TT.0*, %struct.TT.0** [[E_ADDR]], align 8 +// CHECK0-NEXT: [[TMP1:%.*]] = bitcast %struct.TT.0* [[E1]] to i8* +// CHECK0-NEXT: [[TMP2:%.*]] = bitcast %struct.TT.0* [[TMP0]] to i8* +// CHECK0-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i64 8, i1 false) +// CHECK0-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E1]], i32 0, i32 0 +// CHECK0-NEXT: [[TMP3:%.*]] = load i32, i32* [[X]], align 4 +// CHECK0-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP3]] to double +// CHECK0-NEXT: [[TMP4:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// CHECK0-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[TMP4]], i64 0 +// CHECK0-NEXT: store double [[CONV]], double* [[ARRAYIDX]], align 8 +// CHECK0-NEXT: [[TMP5:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// CHECK0-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[TMP5]], i64 0 +// CHECK0-NEXT: [[TMP6:%.*]] = load double, double* [[ARRAYIDX2]], align 8 +// CHECK0-NEXT: [[INC:%.*]] = fadd double [[TMP6]], 1.000000e+00 +// CHECK0-NEXT: store double [[INC]], double* [[ARRAYIDX2]], align 8 +// CHECK0-NEXT: ret void +// +// +// CHECK0-LABEL: define {{[^@]+}}@_Z3bariPd +// CHECK0-SAME: (i32 noundef signext [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0]] { +// CHECK0-NEXT: entry: +// CHECK0-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK0-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 8 +// CHECK0-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK0-NEXT: [[S:%.*]] = alloca [[STRUCT_S1:%.*]], align 8 +// CHECK0-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK0-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 8 +// CHECK0-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK0-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK0-NEXT: [[TMP1:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// CHECK0-NEXT: [[CALL:%.*]] = call noundef signext i32 @_Z3fooiPd(i32 noundef signext [[TMP0]], double* noundef [[TMP1]]) +// CHECK0-NEXT: [[TMP2:%.*]] = load i32, i32* [[A]], align 4 +// CHECK0-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[CALL]] +// CHECK0-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// CHECK0-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK0-NEXT: [[CALL1:%.*]] = call noundef signext i32 @_ZN2S12r1Ei(%struct.S1* noundef nonnull align 8 dereferenceable(8) [[S]], i32 noundef signext [[TMP3]]) +// CHECK0-NEXT: [[TMP4:%.*]] = load i32, i32* [[A]], align 4 +// CHECK0-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], [[CALL1]] +// CHECK0-NEXT: store i32 [[ADD2]], i32* [[A]], align 4 +// CHECK0-NEXT: [[TMP5:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK0-NEXT: [[CALL3:%.*]] = call noundef signext i32 @_ZL7fstatici(i32 noundef signext [[TMP5]]) +// CHECK0-NEXT: [[TMP6:%.*]] = 
load i32, i32* [[A]], align 4 +// CHECK0-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP6]], [[CALL3]] +// CHECK0-NEXT: store i32 [[ADD4]], i32* [[A]], align 4 +// CHECK0-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK0-NEXT: [[CALL5:%.*]] = call noundef signext i32 @_Z9ftemplateIiET_i(i32 noundef signext [[TMP7]]) +// CHECK0-NEXT: [[TMP8:%.*]] = load i32, i32* [[A]], align 4 +// CHECK0-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP8]], [[CALL5]] +// CHECK0-NEXT: store i32 [[ADD6]], i32* [[A]], align 4 +// CHECK0-NEXT: [[TMP9:%.*]] = load i32, i32* [[A]], align 4 +// CHECK0-NEXT: ret i32 [[TMP9]] +// +// +// CHECK0-LABEL: define {{[^@]+}}@_ZN2S12r1Ei +// CHECK0-SAME: (%struct.S1* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] comdat align 2 { +// CHECK0-NEXT: entry: +// CHECK0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 8 +// CHECK0-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK0-NEXT: [[B:%.*]] = alloca i32, align 4 +// CHECK0-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// CHECK0-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// CHECK0-NEXT: [[B_CASTED:%.*]] = alloca i64, align 8 +// CHECK0-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK0-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK0-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [5 x i8*], align 8 +// CHECK0-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [5 x i64], align 8 +// CHECK0-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK0-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 8 +// CHECK0-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK0-NEXT: [[THIS1:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 8 +// CHECK0-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK0-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// CHECK0-NEXT: store i32 [[ADD]], i32* [[B]], align 4 +// CHECK0-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK0-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK0-NEXT: [[TMP3:%.*]] = call i8* @llvm.stacksave() +// CHECK0-NEXT: store i8* [[TMP3]], i8** [[SAVED_STACK]], align 8 +// CHECK0-NEXT: [[TMP4:%.*]] = mul nuw i64 2, [[TMP2]] +// CHECK0-NEXT: [[VLA:%.*]] = alloca i16, i64 [[TMP4]], align 2 +// CHECK0-NEXT: store i64 [[TMP2]], i64* [[__VLA_EXPR0]], align 8 +// CHECK0-NEXT: [[TMP5:%.*]] = load i32, i32* [[B]], align 4 +// CHECK0-NEXT: [[CONV:%.*]] = bitcast i64* [[B_CASTED]] to i32* +// CHECK0-NEXT: store i32 [[TMP5]], i32* [[CONV]], align 4 +// CHECK0-NEXT: [[TMP6:%.*]] = load i64, i64* [[B_CASTED]], align 8 +// CHECK0-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[THIS1]], i32 0, i32 0 +// CHECK0-NEXT: [[TMP7:%.*]] = mul nuw i64 2, [[TMP2]] +// CHECK0-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2 +// CHECK0-NEXT: [[TMP9:%.*]] = bitcast [5 x i64]* [[DOTOFFLOAD_SIZES]] to i8* +// CHECK0-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP9]], i8* align 8 bitcast ([5 x i64]* @.offload_sizes.5 to i8*), i64 40, i1 false) +// CHECK0-NEXT: [[TMP10:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK0-NEXT: [[TMP11:%.*]] = bitcast i8** [[TMP10]] to %struct.S1** +// CHECK0-NEXT: store %struct.S1* [[THIS1]], %struct.S1** [[TMP11]], align 8 +// CHECK0-NEXT: [[TMP12:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK0-NEXT: [[TMP13:%.*]] = bitcast i8** [[TMP12]] to double** +// 
CHECK0-NEXT: store double* [[A]], double** [[TMP13]], align 8 +// CHECK0-NEXT: [[TMP14:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK0-NEXT: store i8* null, i8** [[TMP14]], align 8 +// CHECK0-NEXT: [[TMP15:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK0-NEXT: [[TMP16:%.*]] = bitcast i8** [[TMP15]] to i64* +// CHECK0-NEXT: store i64 [[TMP6]], i64* [[TMP16]], align 8 +// CHECK0-NEXT: [[TMP17:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK0-NEXT: [[TMP18:%.*]] = bitcast i8** [[TMP17]] to i64* +// CHECK0-NEXT: store i64 [[TMP6]], i64* [[TMP18]], align 8 +// CHECK0-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CHECK0-NEXT: store i8* null, i8** [[TMP19]], align 8 +// CHECK0-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK0-NEXT: [[TMP21:%.*]] = bitcast i8** [[TMP20]] to i64* +// CHECK0-NEXT: store i64 2, i64* [[TMP21]], align 8 +// CHECK0-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK0-NEXT: [[TMP23:%.*]] = bitcast i8** [[TMP22]] to i64* +// CHECK0-NEXT: store i64 2, i64* [[TMP23]], align 8 +// CHECK0-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2 +// CHECK0-NEXT: store i8* null, i8** [[TMP24]], align 8 +// CHECK0-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 3 +// CHECK0-NEXT: [[TMP26:%.*]] = bitcast i8** [[TMP25]] to i64* +// CHECK0-NEXT: store i64 [[TMP2]], i64* [[TMP26]], align 8 +// CHECK0-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 3 +// CHECK0-NEXT: [[TMP28:%.*]] = bitcast i8** [[TMP27]] to i64* +// CHECK0-NEXT: store i64 [[TMP2]], i64* [[TMP28]], align 8 +// CHECK0-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 3 +// CHECK0-NEXT: store i8* null, i8** [[TMP29]], align 8 +// CHECK0-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 4 +// CHECK0-NEXT: [[TMP31:%.*]] = bitcast i8** [[TMP30]] to i16** +// CHECK0-NEXT: store i16* [[VLA]], i16** [[TMP31]], align 8 +// CHECK0-NEXT: [[TMP32:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 4 +// CHECK0-NEXT: [[TMP33:%.*]] = bitcast i8** [[TMP32]] to i16** +// CHECK0-NEXT: store i16* [[VLA]], i16** [[TMP33]], align 8 +// CHECK0-NEXT: [[TMP34:%.*]] = getelementptr inbounds [5 x i64], [5 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 4 +// CHECK0-NEXT: store i64 [[TMP8]], i64* [[TMP34]], align 8 +// CHECK0-NEXT: [[TMP35:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 4 +// CHECK0-NEXT: store i8* null, i8** [[TMP35]], align 8 +// CHECK0-NEXT: [[TMP36:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK0-NEXT: [[TMP37:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK0-NEXT: [[TMP38:%.*]] = getelementptr inbounds [5 x i64], [5 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +// CHECK0-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK0-NEXT: store i32 2, i32* [[TMP39]], align 4 +// 
CHECK0-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK0-NEXT: store i32 5, i32* [[TMP40]], align 4 +// CHECK0-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK0-NEXT: store i8** [[TMP36]], i8*** [[TMP41]], align 8 +// CHECK0-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK0-NEXT: store i8** [[TMP37]], i8*** [[TMP42]], align 8 +// CHECK0-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK0-NEXT: store i64* [[TMP38]], i64** [[TMP43]], align 8 +// CHECK0-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK0-NEXT: store i64* getelementptr inbounds ([5 x i64], [5 x i64]* @.offload_maptypes.6, i32 0, i32 0), i64** [[TMP44]], align 8 +// CHECK0-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK0-NEXT: store i8** null, i8*** [[TMP45]], align 8 +// CHECK0-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK0-NEXT: store i8** null, i8*** [[TMP46]], align 8 +// CHECK0-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK0-NEXT: store i64 0, i64* [[TMP47]], align 8 +// CHECK0-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK0-NEXT: store i64 0, i64* [[TMP48]], align 8 +// CHECK0-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK0-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP49]], align 4 +// CHECK0-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK0-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP50]], align 4 +// CHECK0-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK0-NEXT: store i32 0, i32* [[TMP51]], align 4 +// CHECK0-NEXT: [[TMP52:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK0-NEXT: [[TMP53:%.*]] = icmp ne i32 [[TMP52]], 0 +// CHECK0-NEXT: br i1 [[TMP53]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK0: omp_offload.failed: +// CHECK0-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167(%struct.S1* [[THIS1]], i64 [[TMP6]], i64 2, i64 [[TMP2]], i16* [[VLA]]) #[[ATTR3]] +// CHECK0-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK0: omp_offload.cont: +// CHECK0-NEXT: [[TMP54:%.*]] = mul nsw i64 1, [[TMP2]] +// CHECK0-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA]], i64 [[TMP54]] +// CHECK0-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, 
i16* [[ARRAYIDX]], i64 1 +// CHECK0-NEXT: [[TMP55:%.*]] = load i16, i16* [[ARRAYIDX2]], align 2 +// CHECK0-NEXT: [[CONV3:%.*]] = sext i16 [[TMP55]] to i32 +// CHECK0-NEXT: [[TMP56:%.*]] = load i32, i32* [[B]], align 4 +// CHECK0-NEXT: [[ADD4:%.*]] = add nsw i32 [[CONV3]], [[TMP56]] +// CHECK0-NEXT: [[TMP57:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// CHECK0-NEXT: call void @llvm.stackrestore(i8* [[TMP57]]) +// CHECK0-NEXT: ret i32 [[ADD4]] +// +// +// CHECK0-LABEL: define {{[^@]+}}@_ZL7fstatici +// CHECK0-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0]] { +// CHECK0-NEXT: entry: +// CHECK0-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK0-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK0-NEXT: [[AAA:%.*]] = alloca i8, align 1 +// CHECK0-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// CHECK0-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8 +// CHECK0-NEXT: [[AAA_CASTED:%.*]] = alloca i64, align 8 +// CHECK0-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK0-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK0-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x i8*], align 8 +// CHECK0-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK0-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK0-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK0-NEXT: store i8 0, i8* [[AAA]], align 1 +// CHECK0-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// CHECK0-NEXT: [[CONV:%.*]] = bitcast i64* [[A_CASTED]] to i32* +// CHECK0-NEXT: store i32 [[TMP0]], i32* [[CONV]], align 4 +// CHECK0-NEXT: [[TMP1:%.*]] = load i64, i64* [[A_CASTED]], align 8 +// CHECK0-NEXT: [[TMP2:%.*]] = load i8, i8* [[AAA]], align 1 +// CHECK0-NEXT: [[CONV1:%.*]] = bitcast i64* [[AAA_CASTED]] to i8* +// CHECK0-NEXT: store i8 [[TMP2]], i8* [[CONV1]], align 1 +// CHECK0-NEXT: [[TMP3:%.*]] = load i64, i64* [[AAA_CASTED]], align 8 +// CHECK0-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK0-NEXT: [[TMP5:%.*]] = bitcast i8** [[TMP4]] to i64* +// CHECK0-NEXT: store i64 [[TMP1]], i64* [[TMP5]], align 8 +// CHECK0-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK0-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i64* +// CHECK0-NEXT: store i64 [[TMP1]], i64* [[TMP7]], align 8 +// CHECK0-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK0-NEXT: store i8* null, i8** [[TMP8]], align 8 +// CHECK0-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK0-NEXT: [[TMP10:%.*]] = bitcast i8** [[TMP9]] to i64* +// CHECK0-NEXT: store i64 [[TMP3]], i64* [[TMP10]], align 8 +// CHECK0-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK0-NEXT: [[TMP12:%.*]] = bitcast i8** [[TMP11]] to i64* +// CHECK0-NEXT: store i64 [[TMP3]], i64* [[TMP12]], align 8 +// CHECK0-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CHECK0-NEXT: store i8* null, i8** [[TMP13]], align 8 +// CHECK0-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK0-NEXT: [[TMP15:%.*]] = bitcast i8** [[TMP14]] to [10 x i32]** +// CHECK0-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP15]], align 8 +// CHECK0-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, 
i32 2 +// CHECK0-NEXT: [[TMP17:%.*]] = bitcast i8** [[TMP16]] to [10 x i32]** +// CHECK0-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP17]], align 8 +// CHECK0-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2 +// CHECK0-NEXT: store i8* null, i8** [[TMP18]], align 8 +// CHECK0-NEXT: [[TMP19:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK0-NEXT: [[TMP20:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK0-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK0-NEXT: store i32 2, i32* [[TMP21]], align 4 +// CHECK0-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK0-NEXT: store i32 3, i32* [[TMP22]], align 4 +// CHECK0-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK0-NEXT: store i8** [[TMP19]], i8*** [[TMP23]], align 8 +// CHECK0-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK0-NEXT: store i8** [[TMP20]], i8*** [[TMP24]], align 8 +// CHECK0-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK0-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_sizes.7, i32 0, i32 0), i64** [[TMP25]], align 8 +// CHECK0-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK0-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_maptypes.8, i32 0, i32 0), i64** [[TMP26]], align 8 +// CHECK0-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK0-NEXT: store i8** null, i8*** [[TMP27]], align 8 +// CHECK0-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK0-NEXT: store i8** null, i8*** [[TMP28]], align 8 +// CHECK0-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK0-NEXT: store i64 0, i64* [[TMP29]], align 8 +// CHECK0-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK0-NEXT: store i64 0, i64* [[TMP30]], align 8 +// CHECK0-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK0-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP31]], align 4 +// CHECK0-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK0-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP32]], align 4 +// CHECK0-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK0-NEXT: store i32 0, i32* [[TMP33]], align 4 +// CHECK0-NEXT: [[TMP34:%.*]] = call i32 
@__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK0-NEXT: [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0 +// CHECK0-NEXT: br i1 [[TMP35]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK0: omp_offload.failed: +// CHECK0-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142(i64 [[TMP1]], i64 [[TMP3]], [10 x i32]* [[B]]) #[[ATTR3]] +// CHECK0-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK0: omp_offload.cont: +// CHECK0-NEXT: [[TMP36:%.*]] = load i32, i32* [[A]], align 4 +// CHECK0-NEXT: ret i32 [[TMP36]] +// +// +// CHECK0-LABEL: define {{[^@]+}}@_Z9ftemplateIiET_i +// CHECK0-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0]] comdat { +// CHECK0-NEXT: entry: +// CHECK0-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK0-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK0-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// CHECK0-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8 +// CHECK0-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [2 x i8*], align 8 +// CHECK0-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [2 x i8*], align 8 +// CHECK0-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [2 x i8*], align 8 +// CHECK0-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK0-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK0-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK0-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// CHECK0-NEXT: [[CONV:%.*]] = bitcast i64* [[A_CASTED]] to i32* +// CHECK0-NEXT: store i32 [[TMP0]], i32* [[CONV]], align 4 +// CHECK0-NEXT: [[TMP1:%.*]] = load i64, i64* [[A_CASTED]], align 8 +// CHECK0-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK0-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i64* +// CHECK0-NEXT: store i64 [[TMP1]], i64* [[TMP3]], align 8 +// CHECK0-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK0-NEXT: [[TMP5:%.*]] = bitcast i8** [[TMP4]] to i64* +// CHECK0-NEXT: store i64 [[TMP1]], i64* [[TMP5]], align 8 +// CHECK0-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK0-NEXT: store i8* null, i8** [[TMP6]], align 8 +// CHECK0-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK0-NEXT: [[TMP8:%.*]] = bitcast i8** [[TMP7]] to [10 x i32]** +// CHECK0-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP8]], align 8 +// CHECK0-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK0-NEXT: [[TMP10:%.*]] = bitcast i8** [[TMP9]] to [10 x i32]** +// CHECK0-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP10]], align 8 +// CHECK0-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CHECK0-NEXT: store i8* null, i8** [[TMP11]], align 8 +// CHECK0-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK0-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK0-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK0-NEXT: store i32 2, i32* [[TMP14]], align 4 +// CHECK0-NEXT: [[TMP15:%.*]] = 
getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK0-NEXT: store i32 2, i32* [[TMP15]], align 4 +// CHECK0-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK0-NEXT: store i8** [[TMP12]], i8*** [[TMP16]], align 8 +// CHECK0-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK0-NEXT: store i8** [[TMP13]], i8*** [[TMP17]], align 8 +// CHECK0-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK0-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.9, i32 0, i32 0), i64** [[TMP18]], align 8 +// CHECK0-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK0-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.10, i32 0, i32 0), i64** [[TMP19]], align 8 +// CHECK0-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK0-NEXT: store i8** null, i8*** [[TMP20]], align 8 +// CHECK0-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK0-NEXT: store i8** null, i8*** [[TMP21]], align 8 +// CHECK0-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK0-NEXT: store i64 0, i64* [[TMP22]], align 8 +// CHECK0-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK0-NEXT: store i64 0, i64* [[TMP23]], align 8 +// CHECK0-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK0-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP24]], align 4 +// CHECK0-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK0-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP25]], align 4 +// CHECK0-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK0-NEXT: store i32 0, i32* [[TMP26]], align 4 +// CHECK0-NEXT: [[TMP27:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK0-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0 +// CHECK0-NEXT: br i1 [[TMP28]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK0: omp_offload.failed: +// CHECK0-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128(i64 [[TMP1]], [10 x i32]* [[B]]) #[[ATTR3]] +// CHECK0-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK0: omp_offload.cont: +// CHECK0-NEXT: [[TMP29:%.*]] = load i32, i32* [[A]], align 4 +// CHECK0-NEXT: ret i32 [[TMP29]] +// +// +// CHECK0-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167 
+// CHECK0-SAME: (%struct.S1* noundef [[THIS:%.*]], i64 noundef [[B:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR2]] { +// CHECK0-NEXT: entry: +// CHECK0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 8 +// CHECK0-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 +// CHECK0-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8 +// CHECK0-NEXT: [[VLA_ADDR2:%.*]] = alloca i64, align 8 +// CHECK0-NEXT: [[C_ADDR:%.*]] = alloca i16*, align 8 +// CHECK0-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// CHECK0-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// CHECK0-NEXT: [[__VLA_EXPR1:%.*]] = alloca i64, align 8 +// CHECK0-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 8 +// CHECK0-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8 +// CHECK0-NEXT: store i64 [[VLA]], i64* [[VLA_ADDR]], align 8 +// CHECK0-NEXT: store i64 [[VLA1]], i64* [[VLA_ADDR2]], align 8 +// CHECK0-NEXT: store i16* [[C]], i16** [[C_ADDR]], align 8 +// CHECK0-NEXT: [[TMP0:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 8 +// CHECK0-NEXT: [[CONV:%.*]] = bitcast i64* [[B_ADDR]] to i32* +// CHECK0-NEXT: [[TMP1:%.*]] = load i64, i64* [[VLA_ADDR]], align 8 +// CHECK0-NEXT: [[TMP2:%.*]] = load i64, i64* [[VLA_ADDR2]], align 8 +// CHECK0-NEXT: [[TMP3:%.*]] = load i16*, i16** [[C_ADDR]], align 8 +// CHECK0-NEXT: [[TMP4:%.*]] = call i8* @llvm.stacksave() +// CHECK0-NEXT: store i8* [[TMP4]], i8** [[SAVED_STACK]], align 8 +// CHECK0-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP1]], [[TMP2]] +// CHECK0-NEXT: [[VLA3:%.*]] = alloca i16, i64 [[TMP5]], align 2 +// CHECK0-NEXT: store i64 [[TMP1]], i64* [[__VLA_EXPR0]], align 8 +// CHECK0-NEXT: store i64 [[TMP2]], i64* [[__VLA_EXPR1]], align 8 +// CHECK0-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP1]], [[TMP2]] +// CHECK0-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 2 +// CHECK0-NEXT: [[TMP8:%.*]] = bitcast i16* [[VLA3]] to i8* +// CHECK0-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP3]] to i8* +// CHECK0-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 [[TMP8]], i8* align 2 [[TMP9]], i64 [[TMP7]], i1 false) +// CHECK0-NEXT: [[TMP10:%.*]] = load i32, i32* [[CONV]], align 4 +// CHECK0-NEXT: [[CONV4:%.*]] = sitofp i32 [[TMP10]] to double +// CHECK0-NEXT: [[ADD:%.*]] = fadd double [[CONV4]], 1.500000e+00 +// CHECK0-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[TMP0]], i32 0, i32 0 +// CHECK0-NEXT: store double [[ADD]], double* [[A]], align 8 +// CHECK0-NEXT: [[A5:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0 +// CHECK0-NEXT: [[TMP11:%.*]] = load double, double* [[A5]], align 8 +// CHECK0-NEXT: [[INC:%.*]] = fadd double [[TMP11]], 1.000000e+00 +// CHECK0-NEXT: store double [[INC]], double* [[A5]], align 8 +// CHECK0-NEXT: [[CONV6:%.*]] = fptosi double [[INC]] to i16 +// CHECK0-NEXT: [[TMP12:%.*]] = mul nsw i64 1, [[TMP2]] +// CHECK0-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA3]], i64 [[TMP12]] +// CHECK0-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i64 1 +// CHECK0-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX7]], align 2 +// CHECK0-NEXT: [[TMP13:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// CHECK0-NEXT: call void @llvm.stackrestore(i8* [[TMP13]]) +// CHECK0-NEXT: ret void +// +// +// CHECK0-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142 +// CHECK0-SAME: (i64 noundef [[A:%.*]], i64 noundef [[AAA:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) 
[[B:%.*]]) #[[ATTR2]] { +// CHECK0-NEXT: entry: +// CHECK0-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK0-NEXT: [[AAA_ADDR:%.*]] = alloca i64, align 8 +// CHECK0-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK0-NEXT: [[B2:%.*]] = alloca [10 x i32], align 4 +// CHECK0-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK0-NEXT: store i64 [[AAA]], i64* [[AAA_ADDR]], align 8 +// CHECK0-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK0-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// CHECK0-NEXT: [[CONV1:%.*]] = bitcast i64* [[AAA_ADDR]] to i8* +// CHECK0-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK0-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B2]] to i8* +// CHECK0-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK0-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i64 40, i1 false) +// CHECK0-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 4 +// CHECK0-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK0-NEXT: store i32 [[ADD]], i32* [[CONV]], align 4 +// CHECK0-NEXT: [[TMP4:%.*]] = load i8, i8* [[CONV1]], align 1 +// CHECK0-NEXT: [[CONV3:%.*]] = sext i8 [[TMP4]] to i32 +// CHECK0-NEXT: [[ADD4:%.*]] = add nsw i32 [[CONV3]], 1 +// CHECK0-NEXT: [[CONV5:%.*]] = trunc i32 [[ADD4]] to i8 +// CHECK0-NEXT: store i8 [[CONV5]], i8* [[CONV1]], align 1 +// CHECK0-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B2]], i64 0, i64 2 +// CHECK0-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK0-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK0-NEXT: store i32 [[ADD6]], i32* [[ARRAYIDX]], align 4 +// CHECK0-NEXT: ret void +// +// +// CHECK0-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128 +// CHECK0-SAME: (i64 noundef [[A:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR2]] { +// CHECK0-NEXT: entry: +// CHECK0-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK0-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK0-NEXT: [[B1:%.*]] = alloca [10 x i32], align 4 +// CHECK0-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK0-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK0-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// CHECK0-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK0-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B1]] to i8* +// CHECK0-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK0-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i64 40, i1 false) +// CHECK0-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 4 +// CHECK0-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK0-NEXT: store i32 [[ADD]], i32* [[CONV]], align 4 +// CHECK0-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B1]], i64 0, i64 2 +// CHECK0-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK0-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK0-NEXT: store i32 [[ADD2]], i32* [[ARRAYIDX]], align 4 +// CHECK0-NEXT: ret void +// +// +// CHECK0-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// CHECK0-SAME: () #[[ATTR5:[0-9]+]] { +// CHECK0-NEXT: entry: +// CHECK0-NEXT: call void @__tgt_register_requires(i64 1) +// CHECK0-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@_Z3fooiPd +// CHECK1-SAME: (i32 noundef signext [[N:%.*]], double* noundef [[PTR:%.*]]) 
#[[ATTR0:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 8 +// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[AA:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[B:%.*]] = alloca [10 x float], align 4 +// CHECK1-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[C:%.*]] = alloca [5 x [10 x double]], align 8 +// CHECK1-NEXT: [[__VLA_EXPR1:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[D:%.*]] = alloca [[STRUCT_TT:%.*]], align 8 +// CHECK1-NEXT: [[E:%.*]] = alloca [[STRUCT_TT_0:%.*]], align 4 +// CHECK1-NEXT: [[P:%.*]] = alloca i32*, align 64 +// CHECK1-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[GA_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x i8*], align 8 +// CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK1-NEXT: [[AA_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS4:%.*]] = alloca [9 x i8*], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_PTRS5:%.*]] = alloca [9 x i8*], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS6:%.*]] = alloca [9 x i8*], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [9 x i64], align 8 +// CHECK1-NEXT: [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS10:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_PTRS11:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS12:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-NEXT: [[KERNEL_ARGS13:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK1-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK1-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 8 +// CHECK1-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK1-NEXT: store i16 0, i16* [[AA]], align 2 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +// CHECK1-NEXT: [[TMP2:%.*]] = call i8* @llvm.stacksave() +// CHECK1-NEXT: store i8* [[TMP2]], i8** [[SAVED_STACK]], align 8 +// CHECK1-NEXT: [[VLA:%.*]] = alloca float, i64 [[TMP1]], align 4 +// CHECK1-NEXT: store i64 [[TMP1]], i64* [[__VLA_EXPR0]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +// CHECK1-NEXT: [[TMP5:%.*]] = mul nuw i64 5, [[TMP4]] +// CHECK1-NEXT: [[VLA1:%.*]] = alloca double, i64 [[TMP5]], align 8 +// CHECK1-NEXT: store i64 [[TMP4]], i64* [[__VLA_EXPR1]], align 8 +// CHECK1-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP6]], i32* [[X]], align 4 +// CHECK1-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP7]], i32* [[Y]], align 4 +// CHECK1-NEXT: store i32* [[A]], i32** [[P]], align 64 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[A]], align 4 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP8]], i32* [[CONV]], align 4 +// CHECK1-NEXT: 
[[TMP9:%.*]] = load i64, i64* [[A_CASTED]], align 8 +// CHECK1-NEXT: [[TMP10:%.*]] = load i32*, i32** [[P]], align 64 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* @ga, align 4 +// CHECK1-NEXT: [[CONV2:%.*]] = bitcast i64* [[GA_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP11]], i32* [[CONV2]], align 4 +// CHECK1-NEXT: [[TMP12:%.*]] = load i64, i64* [[GA_CASTED]], align 8 +// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i8** [[TMP13]] to i64* +// CHECK1-NEXT: store i64 [[TMP9]], i64* [[TMP14]], align 8 +// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP16:%.*]] = bitcast i8** [[TMP15]] to i64* +// CHECK1-NEXT: store i64 [[TMP9]], i64* [[TMP16]], align 8 +// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK1-NEXT: store i8* null, i8** [[TMP17]], align 8 +// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP19:%.*]] = bitcast i8** [[TMP18]] to i32** +// CHECK1-NEXT: store i32* [[TMP10]], i32** [[TMP19]], align 8 +// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i8** [[TMP20]] to i32** +// CHECK1-NEXT: store i32* [[TMP10]], i32** [[TMP21]], align 8 +// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CHECK1-NEXT: store i8* null, i8** [[TMP22]], align 8 +// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK1-NEXT: [[TMP24:%.*]] = bitcast i8** [[TMP23]] to i64* +// CHECK1-NEXT: store i64 [[TMP12]], i64* [[TMP24]], align 8 +// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK1-NEXT: [[TMP26:%.*]] = bitcast i8** [[TMP25]] to i64* +// CHECK1-NEXT: store i64 [[TMP12]], i64* [[TMP26]], align 8 +// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2 +// CHECK1-NEXT: store i8* null, i8** [[TMP27]], align 8 +// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK1-NEXT: store i32 2, i32* [[TMP30]], align 4 +// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK1-NEXT: store i32 3, i32* [[TMP31]], align 4 +// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK1-NEXT: store i8** [[TMP28]], i8*** [[TMP32]], align 8 +// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK1-NEXT: store i8** [[TMP29]], i8*** [[TMP33]], align 8 +// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* 
[[KERNEL_ARGS]], i32 0, i32 4 +// CHECK1-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_sizes, i32 0, i32 0), i64** [[TMP34]], align 8 +// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK1-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_maptypes, i32 0, i32 0), i64** [[TMP35]], align 8 +// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK1-NEXT: store i8** null, i8*** [[TMP36]], align 8 +// CHECK1-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK1-NEXT: store i8** null, i8*** [[TMP37]], align 8 +// CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK1-NEXT: store i64 0, i64* [[TMP38]], align 8 +// CHECK1-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK1-NEXT: store i64 0, i64* [[TMP39]], align 8 +// CHECK1-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK1-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP40]], align 4 +// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK1-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP41]], align 4 +// CHECK1-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK1-NEXT: store i32 0, i32* [[TMP42]], align 4 +// CHECK1-NEXT: [[TMP43:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK1-NEXT: [[TMP44:%.*]] = icmp ne i32 [[TMP43]], 0 +// CHECK1-NEXT: br i1 [[TMP44]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK1: omp_offload.failed: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63(i64 [[TMP9]], i32* [[TMP10]], i64 [[TMP12]]) #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK1: omp_offload.cont: +// CHECK1-NEXT: [[TMP45:%.*]] = load i16, i16* [[AA]], align 2 +// CHECK1-NEXT: [[CONV3:%.*]] = bitcast i64* [[AA_CASTED]] to i16* +// CHECK1-NEXT: store i16 [[TMP45]], i16* [[CONV3]], align 2 +// CHECK1-NEXT: [[TMP46:%.*]] = load i64, i64* [[AA_CASTED]], align 8 +// CHECK1-NEXT: [[TMP47:%.*]] = mul nuw i64 [[TMP1]], 4 +// CHECK1-NEXT: [[TMP48:%.*]] = mul nuw i64 5, [[TMP4]] +// CHECK1-NEXT: [[TMP49:%.*]] = mul nuw i64 [[TMP48]], 8 +// CHECK1-NEXT: [[TMP50:%.*]] = bitcast [9 x i64]* [[DOTOFFLOAD_SIZES]] to i8* +// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP50]], i8* align 8 bitcast ([9 x i64]* @.offload_sizes.1 to i8*), i64 72, i1 false) +// CHECK1-NEXT: [[TMP51:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP52:%.*]] = bitcast i8** [[TMP51]] to i64* +// CHECK1-NEXT: store i64 [[TMP46]], i64* [[TMP52]], align 8 +// CHECK1-NEXT: 
[[TMP53:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP54:%.*]] = bitcast i8** [[TMP53]] to i64* +// CHECK1-NEXT: store i64 [[TMP46]], i64* [[TMP54]], align 8 +// CHECK1-NEXT: [[TMP55:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 0 +// CHECK1-NEXT: store i8* null, i8** [[TMP55]], align 8 +// CHECK1-NEXT: [[TMP56:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP57:%.*]] = bitcast i8** [[TMP56]] to [10 x float]** +// CHECK1-NEXT: store [10 x float]* [[B]], [10 x float]** [[TMP57]], align 8 +// CHECK1-NEXT: [[TMP58:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP59:%.*]] = bitcast i8** [[TMP58]] to [10 x float]** +// CHECK1-NEXT: store [10 x float]* [[B]], [10 x float]** [[TMP59]], align 8 +// CHECK1-NEXT: [[TMP60:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 1 +// CHECK1-NEXT: store i8* null, i8** [[TMP60]], align 8 +// CHECK1-NEXT: [[TMP61:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 2 +// CHECK1-NEXT: [[TMP62:%.*]] = bitcast i8** [[TMP61]] to i64* +// CHECK1-NEXT: store i64 [[TMP1]], i64* [[TMP62]], align 8 +// CHECK1-NEXT: [[TMP63:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 2 +// CHECK1-NEXT: [[TMP64:%.*]] = bitcast i8** [[TMP63]] to i64* +// CHECK1-NEXT: store i64 [[TMP1]], i64* [[TMP64]], align 8 +// CHECK1-NEXT: [[TMP65:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 2 +// CHECK1-NEXT: store i8* null, i8** [[TMP65]], align 8 +// CHECK1-NEXT: [[TMP66:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 3 +// CHECK1-NEXT: [[TMP67:%.*]] = bitcast i8** [[TMP66]] to float** +// CHECK1-NEXT: store float* [[VLA]], float** [[TMP67]], align 8 +// CHECK1-NEXT: [[TMP68:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 3 +// CHECK1-NEXT: [[TMP69:%.*]] = bitcast i8** [[TMP68]] to float** +// CHECK1-NEXT: store float* [[VLA]], float** [[TMP69]], align 8 +// CHECK1-NEXT: [[TMP70:%.*]] = getelementptr inbounds [9 x i64], [9 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 3 +// CHECK1-NEXT: store i64 [[TMP47]], i64* [[TMP70]], align 8 +// CHECK1-NEXT: [[TMP71:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 3 +// CHECK1-NEXT: store i8* null, i8** [[TMP71]], align 8 +// CHECK1-NEXT: [[TMP72:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 4 +// CHECK1-NEXT: [[TMP73:%.*]] = bitcast i8** [[TMP72]] to [5 x [10 x double]]** +// CHECK1-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[TMP73]], align 8 +// CHECK1-NEXT: [[TMP74:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 4 +// CHECK1-NEXT: [[TMP75:%.*]] = bitcast i8** [[TMP74]] to [5 x [10 x double]]** +// CHECK1-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[TMP75]], align 8 +// CHECK1-NEXT: [[TMP76:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 4 +// CHECK1-NEXT: store i8* null, i8** [[TMP76]], align 8 +// CHECK1-NEXT: [[TMP77:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 5 +// CHECK1-NEXT: [[TMP78:%.*]] = bitcast i8** [[TMP77]] to i64* +// CHECK1-NEXT: store 
i64 5, i64* [[TMP78]], align 8 +// CHECK1-NEXT: [[TMP79:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 5 +// CHECK1-NEXT: [[TMP80:%.*]] = bitcast i8** [[TMP79]] to i64* +// CHECK1-NEXT: store i64 5, i64* [[TMP80]], align 8 +// CHECK1-NEXT: [[TMP81:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 5 +// CHECK1-NEXT: store i8* null, i8** [[TMP81]], align 8 +// CHECK1-NEXT: [[TMP82:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 6 +// CHECK1-NEXT: [[TMP83:%.*]] = bitcast i8** [[TMP82]] to i64* +// CHECK1-NEXT: store i64 [[TMP4]], i64* [[TMP83]], align 8 +// CHECK1-NEXT: [[TMP84:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 6 +// CHECK1-NEXT: [[TMP85:%.*]] = bitcast i8** [[TMP84]] to i64* +// CHECK1-NEXT: store i64 [[TMP4]], i64* [[TMP85]], align 8 +// CHECK1-NEXT: [[TMP86:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 6 +// CHECK1-NEXT: store i8* null, i8** [[TMP86]], align 8 +// CHECK1-NEXT: [[TMP87:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 7 +// CHECK1-NEXT: [[TMP88:%.*]] = bitcast i8** [[TMP87]] to double** +// CHECK1-NEXT: store double* [[VLA1]], double** [[TMP88]], align 8 +// CHECK1-NEXT: [[TMP89:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 7 +// CHECK1-NEXT: [[TMP90:%.*]] = bitcast i8** [[TMP89]] to double** +// CHECK1-NEXT: store double* [[VLA1]], double** [[TMP90]], align 8 +// CHECK1-NEXT: [[TMP91:%.*]] = getelementptr inbounds [9 x i64], [9 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 7 +// CHECK1-NEXT: store i64 [[TMP49]], i64* [[TMP91]], align 8 +// CHECK1-NEXT: [[TMP92:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 7 +// CHECK1-NEXT: store i8* null, i8** [[TMP92]], align 8 +// CHECK1-NEXT: [[TMP93:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 8 +// CHECK1-NEXT: [[TMP94:%.*]] = bitcast i8** [[TMP93]] to %struct.TT** +// CHECK1-NEXT: store %struct.TT* [[D]], %struct.TT** [[TMP94]], align 8 +// CHECK1-NEXT: [[TMP95:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 8 +// CHECK1-NEXT: [[TMP96:%.*]] = bitcast i8** [[TMP95]] to %struct.TT** +// CHECK1-NEXT: store %struct.TT* [[D]], %struct.TT** [[TMP96]], align 8 +// CHECK1-NEXT: [[TMP97:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS6]], i64 0, i64 8 +// CHECK1-NEXT: store i8* null, i8** [[TMP97]], align 8 +// CHECK1-NEXT: [[TMP98:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP99:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS5]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP100:%.*]] = getelementptr inbounds [9 x i64], [9 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 0 +// CHECK1-NEXT: store i32 2, i32* [[TMP101]], align 4 +// CHECK1-NEXT: [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 1 +// CHECK1-NEXT: store i32 9, i32* [[TMP102]], align 4 +// CHECK1-NEXT: [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 2 
+// CHECK1-NEXT: store i8** [[TMP98]], i8*** [[TMP103]], align 8 +// CHECK1-NEXT: [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 3 +// CHECK1-NEXT: store i8** [[TMP99]], i8*** [[TMP104]], align 8 +// CHECK1-NEXT: [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 4 +// CHECK1-NEXT: store i64* [[TMP100]], i64** [[TMP105]], align 8 +// CHECK1-NEXT: [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 5 +// CHECK1-NEXT: store i64* getelementptr inbounds ([9 x i64], [9 x i64]* @.offload_maptypes.2, i32 0, i32 0), i64** [[TMP106]], align 8 +// CHECK1-NEXT: [[TMP107:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 6 +// CHECK1-NEXT: store i8** null, i8*** [[TMP107]], align 8 +// CHECK1-NEXT: [[TMP108:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 7 +// CHECK1-NEXT: store i8** null, i8*** [[TMP108]], align 8 +// CHECK1-NEXT: [[TMP109:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 8 +// CHECK1-NEXT: store i64 0, i64* [[TMP109]], align 8 +// CHECK1-NEXT: [[TMP110:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 9 +// CHECK1-NEXT: store i64 0, i64* [[TMP110]], align 8 +// CHECK1-NEXT: [[TMP111:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 10 +// CHECK1-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP111]], align 4 +// CHECK1-NEXT: [[TMP112:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 11 +// CHECK1-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP112]], align 4 +// CHECK1-NEXT: [[TMP113:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]], i32 0, i32 12 +// CHECK1-NEXT: store i32 0, i32* [[TMP113]], align 4 +// CHECK1-NEXT: [[TMP114:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS7]]) +// CHECK1-NEXT: [[TMP115:%.*]] = icmp ne i32 [[TMP114]], 0 +// CHECK1-NEXT: br i1 [[TMP115]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] +// CHECK1: omp_offload.failed8: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70(i64 [[TMP46]], [10 x float]* [[B]], i64 [[TMP1]], float* [[VLA]], [5 x [10 x double]]* [[C]], i64 5, i64 [[TMP4]], double* [[VLA1]], %struct.TT* [[D]]) #[[ATTR3]] +// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT9]] +// CHECK1: omp_offload.cont9: +// CHECK1-NEXT: [[TMP116:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP117:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP118:%.*]] = bitcast i8** [[TMP117]] to double** +// CHECK1-NEXT: store double* [[TMP116]], double** [[TMP118]], align 8 +// CHECK1-NEXT: [[TMP119:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS11]], i32 0, i32 0 
+// CHECK1-NEXT: [[TMP120:%.*]] = bitcast i8** [[TMP119]] to double** +// CHECK1-NEXT: store double* [[TMP116]], double** [[TMP120]], align 8 +// CHECK1-NEXT: [[TMP121:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS12]], i64 0, i64 0 +// CHECK1-NEXT: store i8* null, i8** [[TMP121]], align 8 +// CHECK1-NEXT: [[TMP122:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP123:%.*]] = bitcast i8** [[TMP122]] to %struct.TT.0** +// CHECK1-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[TMP123]], align 8 +// CHECK1-NEXT: [[TMP124:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS11]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP125:%.*]] = bitcast i8** [[TMP124]] to %struct.TT.0** +// CHECK1-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[TMP125]], align 8 +// CHECK1-NEXT: [[TMP126:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS12]], i64 0, i64 1 +// CHECK1-NEXT: store i8* null, i8** [[TMP126]], align 8 +// CHECK1-NEXT: [[TMP127:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP128:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS11]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP129:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 0 +// CHECK1-NEXT: store i32 2, i32* [[TMP129]], align 4 +// CHECK1-NEXT: [[TMP130:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 1 +// CHECK1-NEXT: store i32 2, i32* [[TMP130]], align 4 +// CHECK1-NEXT: [[TMP131:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 2 +// CHECK1-NEXT: store i8** [[TMP127]], i8*** [[TMP131]], align 8 +// CHECK1-NEXT: [[TMP132:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 3 +// CHECK1-NEXT: store i8** [[TMP128]], i8*** [[TMP132]], align 8 +// CHECK1-NEXT: [[TMP133:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 4 +// CHECK1-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.3, i32 0, i32 0), i64** [[TMP133]], align 8 +// CHECK1-NEXT: [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 5 +// CHECK1-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i32 0, i32 0), i64** [[TMP134]], align 8 +// CHECK1-NEXT: [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 6 +// CHECK1-NEXT: store i8** null, i8*** [[TMP135]], align 8 +// CHECK1-NEXT: [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 7 +// CHECK1-NEXT: store i8** null, i8*** [[TMP136]], align 8 +// CHECK1-NEXT: [[TMP137:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 8 +// CHECK1-NEXT: store i64 0, i64* [[TMP137]], align 8 +// CHECK1-NEXT: [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 9 +// CHECK1-NEXT: store i64 0, i64* 
[[TMP138]], align 8 +// CHECK1-NEXT: [[TMP139:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 10 +// CHECK1-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP139]], align 4 +// CHECK1-NEXT: [[TMP140:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 11 +// CHECK1-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP140]], align 4 +// CHECK1-NEXT: [[TMP141:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]], i32 0, i32 12 +// CHECK1-NEXT: store i32 0, i32* [[TMP141]], align 4 +// CHECK1-NEXT: [[TMP142:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS13]]) +// CHECK1-NEXT: [[TMP143:%.*]] = icmp ne i32 [[TMP142]], 0 +// CHECK1-NEXT: br i1 [[TMP143]], label [[OMP_OFFLOAD_FAILED14:%.*]], label [[OMP_OFFLOAD_CONT15:%.*]] +// CHECK1: omp_offload.failed14: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111(double* [[TMP116]], %struct.TT.0* [[E]]) #[[ATTR3]] +// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT15]] +// CHECK1: omp_offload.cont15: +// CHECK1-NEXT: [[TMP144:%.*]] = load i32, i32* [[A]], align 4 +// CHECK1-NEXT: [[TMP145:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// CHECK1-NEXT: call void @llvm.stackrestore(i8* [[TMP145]]) +// CHECK1-NEXT: ret i32 [[TMP144]] +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63 +// CHECK1-SAME: (i64 noundef [[A:%.*]], i32* noundef [[P:%.*]], i64 noundef [[GA:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[P_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[GA_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[P]], i32** [[P_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[GA]], i64* [[GA_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[GA_ADDR]] to i32* +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70 +// CHECK1-SAME: (i64 noundef [[AA:%.*]], [10 x float]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i64 noundef [[VLA:%.*]], float* noundef nonnull align 4 dereferenceable(4) [[BN:%.*]], [5 x [10 x double]]* noundef nonnull align 8 dereferenceable(400) [[C:%.*]], i64 noundef [[VLA1:%.*]], i64 noundef [[VLA3:%.*]], double* noundef nonnull align 8 dereferenceable(8) [[CN:%.*]], %struct.TT* noundef nonnull align 8 dereferenceable(16) [[D:%.*]]) #[[ATTR2]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x float]*, align 8 +// CHECK1-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[BN_ADDR:%.*]] = alloca float*, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [5 x [10 x double]]*, align 8 +// CHECK1-NEXT: [[VLA_ADDR2:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[VLA_ADDR4:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[CN_ADDR:%.*]] = alloca double*, align 8 +// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca %struct.TT*, align 8 +// CHECK1-NEXT: [[B5:%.*]] = alloca [10 x float], align 4 +// CHECK1-NEXT: 
[[SAVED_STACK:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[C7:%.*]] = alloca [5 x [10 x double]], align 8 +// CHECK1-NEXT: [[__VLA_EXPR1:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[__VLA_EXPR2:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[D9:%.*]] = alloca [[STRUCT_TT:%.*]], align 8 +// CHECK1-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8 +// CHECK1-NEXT: store [10 x float]* [[B]], [10 x float]** [[B_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[VLA]], i64* [[VLA_ADDR]], align 8 +// CHECK1-NEXT: store float* [[BN]], float** [[BN_ADDR]], align 8 +// CHECK1-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[C_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[VLA1]], i64* [[VLA_ADDR2]], align 8 +// CHECK1-NEXT: store i64 [[VLA3]], i64* [[VLA_ADDR4]], align 8 +// CHECK1-NEXT: store double* [[CN]], double** [[CN_ADDR]], align 8 +// CHECK1-NEXT: store %struct.TT* [[D]], %struct.TT** [[D_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[AA_ADDR]] to i16* +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x float]*, [10 x float]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* [[VLA_ADDR]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load float*, float** [[BN_ADDR]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load i64, i64* [[VLA_ADDR2]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[VLA_ADDR4]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = load double*, double** [[CN_ADDR]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = load %struct.TT*, %struct.TT** [[D_ADDR]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = bitcast [10 x float]* [[B5]] to i8* +// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [10 x float]* [[TMP0]] to i8* +// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i64 40, i1 false) +// CHECK1-NEXT: [[TMP10:%.*]] = call i8* @llvm.stacksave() +// CHECK1-NEXT: store i8* [[TMP10]], i8** [[SAVED_STACK]], align 8 +// CHECK1-NEXT: [[VLA6:%.*]] = alloca float, i64 [[TMP1]], align 4 +// CHECK1-NEXT: store i64 [[TMP1]], i64* [[__VLA_EXPR0]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP1]], 4 +// CHECK1-NEXT: [[TMP12:%.*]] = bitcast float* [[VLA6]] to i8* +// CHECK1-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP2]] to i8* +// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP12]], i8* align 4 [[TMP13]], i64 [[TMP11]], i1 false) +// CHECK1-NEXT: [[TMP14:%.*]] = bitcast [5 x [10 x double]]* [[C7]] to i8* +// CHECK1-NEXT: [[TMP15:%.*]] = bitcast [5 x [10 x double]]* [[TMP3]] to i8* +// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 400, i1 false) +// CHECK1-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP4]], [[TMP5]] +// CHECK1-NEXT: [[VLA8:%.*]] = alloca double, i64 [[TMP16]], align 8 +// CHECK1-NEXT: store i64 [[TMP4]], i64* [[__VLA_EXPR1]], align 8 +// CHECK1-NEXT: store i64 [[TMP5]], i64* [[__VLA_EXPR2]], align 8 +// CHECK1-NEXT: [[TMP17:%.*]] = mul nuw i64 [[TMP4]], [[TMP5]] +// CHECK1-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 8 +// CHECK1-NEXT: [[TMP19:%.*]] = bitcast double* [[VLA8]] to i8* +// CHECK1-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP6]] to i8* +// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP19]], i8* align 8 [[TMP20]], i64 [[TMP18]], i1 false) +// CHECK1-NEXT: [[TMP21:%.*]] = bitcast %struct.TT* [[D9]] to i8* +// CHECK1-NEXT: [[TMP22:%.*]] = bitcast %struct.TT* [[TMP7]] 
to i8* +// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP21]], i8* align 8 [[TMP22]], i64 16, i1 false) +// CHECK1-NEXT: [[TMP23:%.*]] = load i16, i16* [[CONV]], align 2 +// CHECK1-NEXT: [[CONV10:%.*]] = sext i16 [[TMP23]] to i32 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV10]], 1 +// CHECK1-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD]] to i16 +// CHECK1-NEXT: store i16 [[CONV11]], i16* [[CONV]], align 2 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[B5]], i64 0, i64 2 +// CHECK1-NEXT: store float 1.000000e+00, float* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[VLA6]], i64 3 +// CHECK1-NEXT: store float 1.000000e+00, float* [[ARRAYIDX12]], align 4 +// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[C7]], i64 0, i64 1 +// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX13]], i64 0, i64 2 +// CHECK1-NEXT: store double 1.000000e+00, double* [[ARRAYIDX14]], align 8 +// CHECK1-NEXT: [[TMP24:%.*]] = mul nsw i64 1, [[TMP5]] +// CHECK1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds double, double* [[VLA8]], i64 [[TMP24]] +// CHECK1-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX15]], i64 3 +// CHECK1-NEXT: store double 1.000000e+00, double* [[ARRAYIDX16]], align 8 +// CHECK1-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 0 +// CHECK1-NEXT: store i64 1, i64* [[X]], align 8 +// CHECK1-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 1 +// CHECK1-NEXT: store i8 1, i8* [[Y]], align 8 +// CHECK1-NEXT: [[TMP25:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// CHECK1-NEXT: call void @llvm.stackrestore(i8* [[TMP25]]) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111 +// CHECK1-SAME: (double* noundef [[PTR:%.*]], %struct.TT.0* noundef nonnull align 4 dereferenceable(8) [[E:%.*]]) #[[ATTR2]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 8 +// CHECK1-NEXT: [[E_ADDR:%.*]] = alloca %struct.TT.0*, align 8 +// CHECK1-NEXT: [[E1:%.*]] = alloca [[STRUCT_TT_0:%.*]], align 4 +// CHECK1-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 8 +// CHECK1-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[E_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load %struct.TT.0*, %struct.TT.0** [[E_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = bitcast %struct.TT.0* [[E1]] to i8* +// CHECK1-NEXT: [[TMP2:%.*]] = bitcast %struct.TT.0* [[TMP0]] to i8* +// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i64 8, i1 false) +// CHECK1-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E1]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[X]], align 4 +// CHECK1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP3]] to double +// CHECK1-NEXT: [[TMP4:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[TMP4]], i64 0 +// CHECK1-NEXT: store double [[CONV]], double* [[ARRAYIDX]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[TMP5]], i64 0 +// CHECK1-NEXT: [[TMP6:%.*]] = load double, double* [[ARRAYIDX2]], align 8 +// CHECK1-NEXT: 
[[INC:%.*]] = fadd double [[TMP6]], 1.000000e+00 +// CHECK1-NEXT: store double [[INC]], double* [[ARRAYIDX2]], align 8 +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@_Z3bariPd +// CHECK1-SAME: (i32 noundef signext [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 8 +// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[S:%.*]] = alloca [[STRUCT_S1:%.*]], align 8 +// CHECK1-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK1-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 8 +// CHECK1-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// CHECK1-NEXT: [[CALL:%.*]] = call noundef signext i32 @_Z3fooiPd(i32 noundef signext [[TMP0]], double* noundef [[TMP1]]) +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[A]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[CALL]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK1-NEXT: [[CALL1:%.*]] = call noundef signext i32 @_ZN2S12r1Ei(%struct.S1* noundef nonnull align 8 dereferenceable(8) [[S]], i32 noundef signext [[TMP3]]) +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[A]], align 4 +// CHECK1-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], [[CALL1]] +// CHECK1-NEXT: store i32 [[ADD2]], i32* [[A]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK1-NEXT: [[CALL3:%.*]] = call noundef signext i32 @_ZL7fstatici(i32 noundef signext [[TMP5]]) +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4 +// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP6]], [[CALL3]] +// CHECK1-NEXT: store i32 [[ADD4]], i32* [[A]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK1-NEXT: [[CALL5:%.*]] = call noundef signext i32 @_Z9ftemplateIiET_i(i32 noundef signext [[TMP7]]) +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[A]], align 4 +// CHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP8]], [[CALL5]] +// CHECK1-NEXT: store i32 [[ADD6]], i32* [[A]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[A]], align 4 +// CHECK1-NEXT: ret i32 [[TMP9]] +// +// +// CHECK1-LABEL: define {{[^@]+}}@_ZN2S12r1Ei +// CHECK1-SAME: (%struct.S1* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] comdat align 2 { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[B:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[B_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [5 x i8*], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [5 x i64], align 8 +// CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK1-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 8 +// CHECK1-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK1-NEXT: [[THIS1:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 
8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// CHECK1-NEXT: store i32 [[ADD]], i32* [[B]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK1-NEXT: [[TMP3:%.*]] = call i8* @llvm.stacksave() +// CHECK1-NEXT: store i8* [[TMP3]], i8** [[SAVED_STACK]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = mul nuw i64 2, [[TMP2]] +// CHECK1-NEXT: [[VLA:%.*]] = alloca i16, i64 [[TMP4]], align 2 +// CHECK1-NEXT: store i64 [[TMP2]], i64* [[__VLA_EXPR0]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[B]], align 4 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[B_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP5]], i32* [[CONV]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[B_CASTED]], align 8 +// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[THIS1]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP7:%.*]] = mul nuw i64 2, [[TMP2]] +// CHECK1-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2 +// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [5 x i64]* [[DOTOFFLOAD_SIZES]] to i8* +// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP9]], i8* align 8 bitcast ([5 x i64]* @.offload_sizes.5 to i8*), i64 40, i1 false) +// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP11:%.*]] = bitcast i8** [[TMP10]] to %struct.S1** +// CHECK1-NEXT: store %struct.S1* [[THIS1]], %struct.S1** [[TMP11]], align 8 +// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP13:%.*]] = bitcast i8** [[TMP12]] to double** +// CHECK1-NEXT: store double* [[A]], double** [[TMP13]], align 8 +// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK1-NEXT: store i8* null, i8** [[TMP14]], align 8 +// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP16:%.*]] = bitcast i8** [[TMP15]] to i64* +// CHECK1-NEXT: store i64 [[TMP6]], i64* [[TMP16]], align 8 +// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP18:%.*]] = bitcast i8** [[TMP17]] to i64* +// CHECK1-NEXT: store i64 [[TMP6]], i64* [[TMP18]], align 8 +// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CHECK1-NEXT: store i8* null, i8** [[TMP19]], align 8 +// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i8** [[TMP20]] to i64* +// CHECK1-NEXT: store i64 2, i64* [[TMP21]], align 8 +// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK1-NEXT: [[TMP23:%.*]] = bitcast i8** [[TMP22]] to i64* +// CHECK1-NEXT: store i64 2, i64* [[TMP23]], align 8 +// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2 +// CHECK1-NEXT: store i8* null, i8** [[TMP24]], align 8 +// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 3 +// CHECK1-NEXT: [[TMP26:%.*]] = bitcast i8** [[TMP25]] to i64* +// CHECK1-NEXT: store i64 [[TMP2]], i64* [[TMP26]], align 8 +// CHECK1-NEXT: 
[[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 3 +// CHECK1-NEXT: [[TMP28:%.*]] = bitcast i8** [[TMP27]] to i64* +// CHECK1-NEXT: store i64 [[TMP2]], i64* [[TMP28]], align 8 +// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 3 +// CHECK1-NEXT: store i8* null, i8** [[TMP29]], align 8 +// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 4 +// CHECK1-NEXT: [[TMP31:%.*]] = bitcast i8** [[TMP30]] to i16** +// CHECK1-NEXT: store i16* [[VLA]], i16** [[TMP31]], align 8 +// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 4 +// CHECK1-NEXT: [[TMP33:%.*]] = bitcast i8** [[TMP32]] to i16** +// CHECK1-NEXT: store i16* [[VLA]], i16** [[TMP33]], align 8 +// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds [5 x i64], [5 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 4 +// CHECK1-NEXT: store i64 [[TMP8]], i64* [[TMP34]], align 8 +// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 4 +// CHECK1-NEXT: store i8* null, i8** [[TMP35]], align 8 +// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP37:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds [5 x i64], [5 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK1-NEXT: store i32 2, i32* [[TMP39]], align 4 +// CHECK1-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK1-NEXT: store i32 5, i32* [[TMP40]], align 4 +// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK1-NEXT: store i8** [[TMP36]], i8*** [[TMP41]], align 8 +// CHECK1-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK1-NEXT: store i8** [[TMP37]], i8*** [[TMP42]], align 8 +// CHECK1-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK1-NEXT: store i64* [[TMP38]], i64** [[TMP43]], align 8 +// CHECK1-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK1-NEXT: store i64* getelementptr inbounds ([5 x i64], [5 x i64]* @.offload_maptypes.6, i32 0, i32 0), i64** [[TMP44]], align 8 +// CHECK1-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK1-NEXT: store i8** null, i8*** [[TMP45]], align 8 +// CHECK1-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK1-NEXT: store i8** null, i8*** [[TMP46]], align 8 +// CHECK1-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK1-NEXT: store i64 0, i64* [[TMP47]], align 8 +// 
CHECK1-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK1-NEXT: store i64 0, i64* [[TMP48]], align 8 +// CHECK1-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK1-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP49]], align 4 +// CHECK1-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK1-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP50]], align 4 +// CHECK1-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK1-NEXT: store i32 0, i32* [[TMP51]], align 4 +// CHECK1-NEXT: [[TMP52:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK1-NEXT: [[TMP53:%.*]] = icmp ne i32 [[TMP52]], 0 +// CHECK1-NEXT: br i1 [[TMP53]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK1: omp_offload.failed: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167(%struct.S1* [[THIS1]], i64 [[TMP6]], i64 2, i64 [[TMP2]], i16* [[VLA]]) #[[ATTR3]] +// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK1: omp_offload.cont: +// CHECK1-NEXT: [[TMP54:%.*]] = mul nsw i64 1, [[TMP2]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA]], i64 [[TMP54]] +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i64 1 +// CHECK1-NEXT: [[TMP55:%.*]] = load i16, i16* [[ARRAYIDX2]], align 2 +// CHECK1-NEXT: [[CONV3:%.*]] = sext i16 [[TMP55]] to i32 +// CHECK1-NEXT: [[TMP56:%.*]] = load i32, i32* [[B]], align 4 +// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[CONV3]], [[TMP56]] +// CHECK1-NEXT: [[TMP57:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// CHECK1-NEXT: call void @llvm.stackrestore(i8* [[TMP57]]) +// CHECK1-NEXT: ret i32 [[ADD4]] +// +// +// CHECK1-LABEL: define {{[^@]+}}@_ZL7fstatici +// CHECK1-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[AAA:%.*]] = alloca i8, align 1 +// CHECK1-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// CHECK1-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[AAA_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x i8*], align 8 +// CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK1-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK1-NEXT: store i8 0, i8* [[AAA]], align 1 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP0]], i32* [[CONV]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* [[A_CASTED]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load i8, i8* [[AAA]], align 1 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[AAA_CASTED]] to i8* +// 
CHECK1-NEXT: store i8 [[TMP2]], i8* [[CONV1]], align 1 +// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[AAA_CASTED]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8** [[TMP4]] to i64* +// CHECK1-NEXT: store i64 [[TMP1]], i64* [[TMP5]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i64* +// CHECK1-NEXT: store i64 [[TMP1]], i64* [[TMP7]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK1-NEXT: store i8* null, i8** [[TMP8]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8** [[TMP9]] to i64* +// CHECK1-NEXT: store i64 [[TMP3]], i64* [[TMP10]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i8** [[TMP11]] to i64* +// CHECK1-NEXT: store i64 [[TMP3]], i64* [[TMP12]], align 8 +// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CHECK1-NEXT: store i8* null, i8** [[TMP13]], align 8 +// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK1-NEXT: [[TMP15:%.*]] = bitcast i8** [[TMP14]] to [10 x i32]** +// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP15]], align 8 +// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK1-NEXT: [[TMP17:%.*]] = bitcast i8** [[TMP16]] to [10 x i32]** +// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP17]], align 8 +// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2 +// CHECK1-NEXT: store i8* null, i8** [[TMP18]], align 8 +// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK1-NEXT: store i32 2, i32* [[TMP21]], align 4 +// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK1-NEXT: store i32 3, i32* [[TMP22]], align 4 +// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK1-NEXT: store i8** [[TMP19]], i8*** [[TMP23]], align 8 +// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK1-NEXT: store i8** [[TMP20]], i8*** [[TMP24]], align 8 +// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK1-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_sizes.7, i32 0, i32 0), i64** [[TMP25]], align 8 +// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK1-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_maptypes.8, i32 0, i32 0), i64** [[TMP26]], align 8 +// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK1-NEXT: store i8** null, i8*** [[TMP27]], align 8 +// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK1-NEXT: store i8** null, i8*** [[TMP28]], align 8 +// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK1-NEXT: store i64 0, i64* [[TMP29]], align 8 +// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK1-NEXT: store i64 0, i64* [[TMP30]], align 8 +// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK1-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP31]], align 4 +// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK1-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP32]], align 4 +// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK1-NEXT: store i32 0, i32* [[TMP33]], align 4 +// CHECK1-NEXT: [[TMP34:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK1-NEXT: [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0 +// CHECK1-NEXT: br i1 [[TMP35]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK1: omp_offload.failed: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142(i64 [[TMP1]], i64 [[TMP3]], [10 x i32]* [[B]]) #[[ATTR3]] +// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK1: omp_offload.cont: +// CHECK1-NEXT: [[TMP36:%.*]] = load i32, i32* [[A]], align 4 +// CHECK1-NEXT: ret i32 [[TMP36]] +// +// +// CHECK1-LABEL: define {{[^@]+}}@_Z9ftemplateIiET_i +// CHECK1-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0]] comdat { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// CHECK1-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK1-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP0]], i32* [[CONV]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* [[A_CASTED]], align 8 +// 
CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i64* +// CHECK1-NEXT: store i64 [[TMP1]], i64* [[TMP3]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8** [[TMP4]] to i64* +// CHECK1-NEXT: store i64 [[TMP1]], i64* [[TMP5]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK1-NEXT: store i8* null, i8** [[TMP6]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i8** [[TMP7]] to [10 x i32]** +// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP8]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8** [[TMP9]] to [10 x i32]** +// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP10]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CHECK1-NEXT: store i8* null, i8** [[TMP11]], align 8 +// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK1-NEXT: store i32 2, i32* [[TMP14]], align 4 +// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK1-NEXT: store i32 2, i32* [[TMP15]], align 4 +// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK1-NEXT: store i8** [[TMP12]], i8*** [[TMP16]], align 8 +// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK1-NEXT: store i8** [[TMP13]], i8*** [[TMP17]], align 8 +// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK1-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.9, i32 0, i32 0), i64** [[TMP18]], align 8 +// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK1-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.10, i32 0, i32 0), i64** [[TMP19]], align 8 +// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK1-NEXT: store i8** null, i8*** [[TMP20]], align 8 +// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK1-NEXT: store i8** null, i8*** [[TMP21]], align 8 +// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// 
CHECK1-NEXT: store i64 0, i64* [[TMP22]], align 8 +// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK1-NEXT: store i64 0, i64* [[TMP23]], align 8 +// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK1-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP24]], align 4 +// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK1-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP25]], align 4 +// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK1-NEXT: store i32 0, i32* [[TMP26]], align 4 +// CHECK1-NEXT: [[TMP27:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK1-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0 +// CHECK1-NEXT: br i1 [[TMP28]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK1: omp_offload.failed: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128(i64 [[TMP1]], [10 x i32]* [[B]]) #[[ATTR3]] +// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK1: omp_offload.cont: +// CHECK1-NEXT: [[TMP29:%.*]] = load i32, i32* [[A]], align 4 +// CHECK1-NEXT: ret i32 [[TMP29]] +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167 +// CHECK1-SAME: (%struct.S1* noundef [[THIS:%.*]], i64 noundef [[B:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR2]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[VLA_ADDR2:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca i16*, align 8 +// CHECK1-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[__VLA_EXPR1:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[VLA]], i64* [[VLA_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[VLA1]], i64* [[VLA_ADDR2]], align 8 +// CHECK1-NEXT: store i16* [[C]], i16** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[B_ADDR]] to i32* +// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* [[VLA_ADDR]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load i64, i64* [[VLA_ADDR2]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i16*, i16** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = call i8* @llvm.stacksave() +// CHECK1-NEXT: store i8* [[TMP4]], i8** [[SAVED_STACK]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP1]], [[TMP2]] +// CHECK1-NEXT: [[VLA3:%.*]] = alloca i16, i64 [[TMP5]], align 2 +// CHECK1-NEXT: store i64 [[TMP1]], i64* [[__VLA_EXPR0]], align 8 +// CHECK1-NEXT: store i64 [[TMP2]], i64* [[__VLA_EXPR1]], align 8 +// 
CHECK1-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP1]], [[TMP2]] +// CHECK1-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 2 +// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i16* [[VLA3]] to i8* +// CHECK1-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP3]] to i8* +// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 [[TMP8]], i8* align 2 [[TMP9]], i64 [[TMP7]], i1 false) +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[CONV]], align 4 +// CHECK1-NEXT: [[CONV4:%.*]] = sitofp i32 [[TMP10]] to double +// CHECK1-NEXT: [[ADD:%.*]] = fadd double [[CONV4]], 1.500000e+00 +// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[TMP0]], i32 0, i32 0 +// CHECK1-NEXT: store double [[ADD]], double* [[A]], align 8 +// CHECK1-NEXT: [[A5:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP11:%.*]] = load double, double* [[A5]], align 8 +// CHECK1-NEXT: [[INC:%.*]] = fadd double [[TMP11]], 1.000000e+00 +// CHECK1-NEXT: store double [[INC]], double* [[A5]], align 8 +// CHECK1-NEXT: [[CONV6:%.*]] = fptosi double [[INC]] to i16 +// CHECK1-NEXT: [[TMP12:%.*]] = mul nsw i64 1, [[TMP2]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA3]], i64 [[TMP12]] +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i64 1 +// CHECK1-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX7]], align 2 +// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// CHECK1-NEXT: call void @llvm.stackrestore(i8* [[TMP13]]) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142 +// CHECK1-SAME: (i64 noundef [[A:%.*]], i64 noundef [[AAA:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR2]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[AAA_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[B2:%.*]] = alloca [10 x i32], align 4 +// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[AAA]], i64* [[AAA_ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[AAA_ADDR]] to i8* +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B2]] to i8* +// CHECK1-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i64 40, i1 false) +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i8, i8* [[CONV1]], align 1 +// CHECK1-NEXT: [[CONV3:%.*]] = sext i8 [[TMP4]] to i32 +// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[CONV3]], 1 +// CHECK1-NEXT: [[CONV5:%.*]] = trunc i32 [[ADD4]] to i8 +// CHECK1-NEXT: store i8 [[CONV5]], i8* [[CONV1]], align 1 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B2]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK1-NEXT: store i32 [[ADD6]], i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128 +// CHECK1-SAME: (i64 noundef [[A:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR2]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[B1:%.*]] = alloca [10 x i32], align 4 +// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B1]] to i8* +// CHECK1-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i64 40, i1 false) +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV]], align 4 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B1]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK1-NEXT: store i32 [[ADD2]], i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// CHECK1-SAME: () #[[ATTR5:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) +// CHECK1-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@_Z3fooiPd +// CHECK2-SAME: (i32 noundef [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 4 +// CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[AA:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[B:%.*]] = alloca [10 x float], align 4 +// CHECK2-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[C:%.*]] = alloca [5 x [10 x double]], align 8 +// CHECK2-NEXT: [[__VLA_EXPR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[D:%.*]] = alloca [[STRUCT_TT:%.*]], align 4 +// CHECK2-NEXT: [[E:%.*]] = alloca [[STRUCT_TT_0:%.*]], align 4 +// CHECK2-NEXT: [[P:%.*]] = alloca i32*, align 64 +// CHECK2-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[GA_CASTED:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK2-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK2-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x i8*], align 4 +// CHECK2-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK2-NEXT: [[AA_CASTED:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOFFLOAD_BASEPTRS2:%.*]] = alloca [9 x i8*], align 4 +// CHECK2-NEXT: [[DOTOFFLOAD_PTRS3:%.*]] = alloca [9 x i8*], align 4 +// CHECK2-NEXT: [[DOTOFFLOAD_MAPPERS4:%.*]] = alloca [9 x i8*], align 4 +// CHECK2-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [9 x i64], align 4 +// CHECK2-NEXT: [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK2-NEXT: [[DOTOFFLOAD_BASEPTRS8:%.*]] = alloca [2 x i8*], align 4 +// CHECK2-NEXT: [[DOTOFFLOAD_PTRS9:%.*]] = alloca [2 x i8*], align 4 +// CHECK2-NEXT: [[DOTOFFLOAD_MAPPERS10:%.*]] = alloca 
[2 x i8*], align 4 +// CHECK2-NEXT: [[KERNEL_ARGS11:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK2-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK2-NEXT: store i16 0, i16* [[AA]], align 2 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = call i8* @llvm.stacksave() +// CHECK2-NEXT: store i8* [[TMP1]], i8** [[SAVED_STACK]], align 4 +// CHECK2-NEXT: [[VLA:%.*]] = alloca float, i32 [[TMP0]], align 4 +// CHECK2-NEXT: store i32 [[TMP0]], i32* [[__VLA_EXPR0]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = mul nuw i32 5, [[TMP2]] +// CHECK2-NEXT: [[VLA1:%.*]] = alloca double, i32 [[TMP3]], align 8 +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[__VLA_EXPR1]], align 4 +// CHECK2-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[X]], align 4 +// CHECK2-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[Y]], align 4 +// CHECK2-NEXT: store i32* [[A]], i32** [[P]], align 64 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4 +// CHECK2-NEXT: store i32 [[TMP6]], i32* [[A_CASTED]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[A_CASTED]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32*, i32** [[P]], align 64 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* @ga, align 4 +// CHECK2-NEXT: store i32 [[TMP9]], i32* [[GA_CASTED]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[GA_CASTED]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i8** [[TMP11]] to i32* +// CHECK2-NEXT: store i32 [[TMP7]], i32* [[TMP12]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i8** [[TMP13]] to i32* +// CHECK2-NEXT: store i32 [[TMP7]], i32* [[TMP14]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CHECK2-NEXT: store i8* null, i8** [[TMP15]], align 4 +// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP17:%.*]] = bitcast i8** [[TMP16]] to i32** +// CHECK2-NEXT: store i32* [[TMP8]], i32** [[TMP17]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP19:%.*]] = bitcast i8** [[TMP18]] to i32** +// CHECK2-NEXT: store i32* [[TMP8]], i32** [[TMP19]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1 +// CHECK2-NEXT: store i8* null, i8** [[TMP20]], align 4 +// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK2-NEXT: [[TMP22:%.*]] = bitcast i8** [[TMP21]] to i32* +// CHECK2-NEXT: store i32 [[TMP10]], i32* [[TMP22]], align 4 +// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 2 
+// CHECK2-NEXT: [[TMP24:%.*]] = bitcast i8** [[TMP23]] to i32* +// CHECK2-NEXT: store i32 [[TMP10]], i32* [[TMP24]], align 4 +// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2 +// CHECK2-NEXT: store i8* null, i8** [[TMP25]], align 4 +// CHECK2-NEXT: [[TMP26:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP27:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK2-NEXT: store i32 2, i32* [[TMP28]], align 4 +// CHECK2-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK2-NEXT: store i32 3, i32* [[TMP29]], align 4 +// CHECK2-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK2-NEXT: store i8** [[TMP26]], i8*** [[TMP30]], align 4 +// CHECK2-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK2-NEXT: store i8** [[TMP27]], i8*** [[TMP31]], align 4 +// CHECK2-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK2-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_sizes, i32 0, i32 0), i64** [[TMP32]], align 4 +// CHECK2-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK2-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_maptypes, i32 0, i32 0), i64** [[TMP33]], align 4 +// CHECK2-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK2-NEXT: store i8** null, i8*** [[TMP34]], align 4 +// CHECK2-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK2-NEXT: store i8** null, i8*** [[TMP35]], align 4 +// CHECK2-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK2-NEXT: store i64 0, i64* [[TMP36]], align 8 +// CHECK2-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK2-NEXT: store i64 0, i64* [[TMP37]], align 8 +// CHECK2-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK2-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP38]], align 4 +// CHECK2-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK2-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP39]], align 4 +// CHECK2-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK2-NEXT: store i32 0, i32* [[TMP40]], align 4 +// CHECK2-NEXT: [[TMP41:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* 
@[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK2-NEXT: [[TMP42:%.*]] = icmp ne i32 [[TMP41]], 0 +// CHECK2-NEXT: br i1 [[TMP42]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK2: omp_offload.failed: +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63(i32 [[TMP7]], i32* [[TMP8]], i32 [[TMP10]]) #[[ATTR3:[0-9]+]] +// CHECK2-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK2: omp_offload.cont: +// CHECK2-NEXT: [[TMP43:%.*]] = load i16, i16* [[AA]], align 2 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_CASTED]] to i16* +// CHECK2-NEXT: store i16 [[TMP43]], i16* [[CONV]], align 2 +// CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[AA_CASTED]], align 4 +// CHECK2-NEXT: [[TMP45:%.*]] = mul nuw i32 [[TMP0]], 4 +// CHECK2-NEXT: [[TMP46:%.*]] = sext i32 [[TMP45]] to i64 +// CHECK2-NEXT: [[TMP47:%.*]] = mul nuw i32 5, [[TMP2]] +// CHECK2-NEXT: [[TMP48:%.*]] = mul nuw i32 [[TMP47]], 8 +// CHECK2-NEXT: [[TMP49:%.*]] = sext i32 [[TMP48]] to i64 +// CHECK2-NEXT: [[TMP50:%.*]] = bitcast [9 x i64]* [[DOTOFFLOAD_SIZES]] to i8* +// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP50]], i8* align 4 bitcast ([9 x i64]* @.offload_sizes.1 to i8*), i32 72, i1 false) +// CHECK2-NEXT: [[TMP51:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP52:%.*]] = bitcast i8** [[TMP51]] to i32* +// CHECK2-NEXT: store i32 [[TMP44]], i32* [[TMP52]], align 4 +// CHECK2-NEXT: [[TMP53:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP54:%.*]] = bitcast i8** [[TMP53]] to i32* +// CHECK2-NEXT: store i32 [[TMP44]], i32* [[TMP54]], align 4 +// CHECK2-NEXT: [[TMP55:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 0 +// CHECK2-NEXT: store i8* null, i8** [[TMP55]], align 4 +// CHECK2-NEXT: [[TMP56:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP57:%.*]] = bitcast i8** [[TMP56]] to [10 x float]** +// CHECK2-NEXT: store [10 x float]* [[B]], [10 x float]** [[TMP57]], align 4 +// CHECK2-NEXT: [[TMP58:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP59:%.*]] = bitcast i8** [[TMP58]] to [10 x float]** +// CHECK2-NEXT: store [10 x float]* [[B]], [10 x float]** [[TMP59]], align 4 +// CHECK2-NEXT: [[TMP60:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 1 +// CHECK2-NEXT: store i8* null, i8** [[TMP60]], align 4 +// CHECK2-NEXT: [[TMP61:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 2 +// CHECK2-NEXT: [[TMP62:%.*]] = bitcast i8** [[TMP61]] to i32* +// CHECK2-NEXT: store i32 [[TMP0]], i32* [[TMP62]], align 4 +// CHECK2-NEXT: [[TMP63:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 2 +// CHECK2-NEXT: [[TMP64:%.*]] = bitcast i8** [[TMP63]] to i32* +// CHECK2-NEXT: store i32 [[TMP0]], i32* [[TMP64]], align 4 +// CHECK2-NEXT: [[TMP65:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 2 +// CHECK2-NEXT: store i8* null, i8** [[TMP65]], align 4 +// CHECK2-NEXT: [[TMP66:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 3 +// CHECK2-NEXT: [[TMP67:%.*]] = bitcast i8** [[TMP66]] to 
float** +// CHECK2-NEXT: store float* [[VLA]], float** [[TMP67]], align 4 +// CHECK2-NEXT: [[TMP68:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 3 +// CHECK2-NEXT: [[TMP69:%.*]] = bitcast i8** [[TMP68]] to float** +// CHECK2-NEXT: store float* [[VLA]], float** [[TMP69]], align 4 +// CHECK2-NEXT: [[TMP70:%.*]] = getelementptr inbounds [9 x i64], [9 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 3 +// CHECK2-NEXT: store i64 [[TMP46]], i64* [[TMP70]], align 4 +// CHECK2-NEXT: [[TMP71:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 3 +// CHECK2-NEXT: store i8* null, i8** [[TMP71]], align 4 +// CHECK2-NEXT: [[TMP72:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 4 +// CHECK2-NEXT: [[TMP73:%.*]] = bitcast i8** [[TMP72]] to [5 x [10 x double]]** +// CHECK2-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[TMP73]], align 4 +// CHECK2-NEXT: [[TMP74:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 4 +// CHECK2-NEXT: [[TMP75:%.*]] = bitcast i8** [[TMP74]] to [5 x [10 x double]]** +// CHECK2-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[TMP75]], align 4 +// CHECK2-NEXT: [[TMP76:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 4 +// CHECK2-NEXT: store i8* null, i8** [[TMP76]], align 4 +// CHECK2-NEXT: [[TMP77:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 5 +// CHECK2-NEXT: [[TMP78:%.*]] = bitcast i8** [[TMP77]] to i32* +// CHECK2-NEXT: store i32 5, i32* [[TMP78]], align 4 +// CHECK2-NEXT: [[TMP79:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 5 +// CHECK2-NEXT: [[TMP80:%.*]] = bitcast i8** [[TMP79]] to i32* +// CHECK2-NEXT: store i32 5, i32* [[TMP80]], align 4 +// CHECK2-NEXT: [[TMP81:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 5 +// CHECK2-NEXT: store i8* null, i8** [[TMP81]], align 4 +// CHECK2-NEXT: [[TMP82:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 6 +// CHECK2-NEXT: [[TMP83:%.*]] = bitcast i8** [[TMP82]] to i32* +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[TMP83]], align 4 +// CHECK2-NEXT: [[TMP84:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 6 +// CHECK2-NEXT: [[TMP85:%.*]] = bitcast i8** [[TMP84]] to i32* +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[TMP85]], align 4 +// CHECK2-NEXT: [[TMP86:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 6 +// CHECK2-NEXT: store i8* null, i8** [[TMP86]], align 4 +// CHECK2-NEXT: [[TMP87:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 7 +// CHECK2-NEXT: [[TMP88:%.*]] = bitcast i8** [[TMP87]] to double** +// CHECK2-NEXT: store double* [[VLA1]], double** [[TMP88]], align 4 +// CHECK2-NEXT: [[TMP89:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 7 +// CHECK2-NEXT: [[TMP90:%.*]] = bitcast i8** [[TMP89]] to double** +// CHECK2-NEXT: store double* [[VLA1]], double** [[TMP90]], align 4 +// CHECK2-NEXT: [[TMP91:%.*]] = getelementptr inbounds [9 x i64], [9 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 7 +// CHECK2-NEXT: store i64 [[TMP49]], i64* [[TMP91]], align 4 +// CHECK2-NEXT: [[TMP92:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 7 +// CHECK2-NEXT: store i8* null, i8** [[TMP92]], 
align 4 +// CHECK2-NEXT: [[TMP93:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 8 +// CHECK2-NEXT: [[TMP94:%.*]] = bitcast i8** [[TMP93]] to %struct.TT** +// CHECK2-NEXT: store %struct.TT* [[D]], %struct.TT** [[TMP94]], align 4 +// CHECK2-NEXT: [[TMP95:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 8 +// CHECK2-NEXT: [[TMP96:%.*]] = bitcast i8** [[TMP95]] to %struct.TT** +// CHECK2-NEXT: store %struct.TT* [[D]], %struct.TT** [[TMP96]], align 4 +// CHECK2-NEXT: [[TMP97:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 8 +// CHECK2-NEXT: store i8* null, i8** [[TMP97]], align 4 +// CHECK2-NEXT: [[TMP98:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP99:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP100:%.*]] = getelementptr inbounds [9 x i64], [9 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 0 +// CHECK2-NEXT: store i32 2, i32* [[TMP101]], align 4 +// CHECK2-NEXT: [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 1 +// CHECK2-NEXT: store i32 9, i32* [[TMP102]], align 4 +// CHECK2-NEXT: [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 2 +// CHECK2-NEXT: store i8** [[TMP98]], i8*** [[TMP103]], align 4 +// CHECK2-NEXT: [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 3 +// CHECK2-NEXT: store i8** [[TMP99]], i8*** [[TMP104]], align 4 +// CHECK2-NEXT: [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 4 +// CHECK2-NEXT: store i64* [[TMP100]], i64** [[TMP105]], align 4 +// CHECK2-NEXT: [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 5 +// CHECK2-NEXT: store i64* getelementptr inbounds ([9 x i64], [9 x i64]* @.offload_maptypes.2, i32 0, i32 0), i64** [[TMP106]], align 4 +// CHECK2-NEXT: [[TMP107:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 6 +// CHECK2-NEXT: store i8** null, i8*** [[TMP107]], align 4 +// CHECK2-NEXT: [[TMP108:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 7 +// CHECK2-NEXT: store i8** null, i8*** [[TMP108]], align 4 +// CHECK2-NEXT: [[TMP109:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 8 +// CHECK2-NEXT: store i64 0, i64* [[TMP109]], align 8 +// CHECK2-NEXT: [[TMP110:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 9 +// CHECK2-NEXT: store i64 0, i64* [[TMP110]], align 8 +// CHECK2-NEXT: [[TMP111:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 10 +// CHECK2-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP111]], align 4 +// CHECK2-NEXT: [[TMP112:%.*]] = 
getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 11 +// CHECK2-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP112]], align 4 +// CHECK2-NEXT: [[TMP113:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 12 +// CHECK2-NEXT: store i32 0, i32* [[TMP113]], align 4 +// CHECK2-NEXT: [[TMP114:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]]) +// CHECK2-NEXT: [[TMP115:%.*]] = icmp ne i32 [[TMP114]], 0 +// CHECK2-NEXT: br i1 [[TMP115]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] +// CHECK2: omp_offload.failed6: +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70(i32 [[TMP44]], [10 x float]* [[B]], i32 [[TMP0]], float* [[VLA]], [5 x [10 x double]]* [[C]], i32 5, i32 [[TMP2]], double* [[VLA1]], %struct.TT* [[D]]) #[[ATTR3]] +// CHECK2-NEXT: br label [[OMP_OFFLOAD_CONT7]] +// CHECK2: omp_offload.cont7: +// CHECK2-NEXT: [[TMP116:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// CHECK2-NEXT: [[TMP117:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP118:%.*]] = bitcast i8** [[TMP117]] to double** +// CHECK2-NEXT: store double* [[TMP116]], double** [[TMP118]], align 4 +// CHECK2-NEXT: [[TMP119:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS9]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP120:%.*]] = bitcast i8** [[TMP119]] to double** +// CHECK2-NEXT: store double* [[TMP116]], double** [[TMP120]], align 4 +// CHECK2-NEXT: [[TMP121:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS10]], i32 0, i32 0 +// CHECK2-NEXT: store i8* null, i8** [[TMP121]], align 4 +// CHECK2-NEXT: [[TMP122:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP123:%.*]] = bitcast i8** [[TMP122]] to %struct.TT.0** +// CHECK2-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[TMP123]], align 4 +// CHECK2-NEXT: [[TMP124:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS9]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP125:%.*]] = bitcast i8** [[TMP124]] to %struct.TT.0** +// CHECK2-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[TMP125]], align 4 +// CHECK2-NEXT: [[TMP126:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS10]], i32 0, i32 1 +// CHECK2-NEXT: store i8* null, i8** [[TMP126]], align 4 +// CHECK2-NEXT: [[TMP127:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP128:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS9]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP129:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 0 +// CHECK2-NEXT: store i32 2, i32* [[TMP129]], align 4 +// CHECK2-NEXT: [[TMP130:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 1 +// CHECK2-NEXT: store i32 2, i32* [[TMP130]], align 4 +// CHECK2-NEXT: [[TMP131:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 2 +// CHECK2-NEXT: store i8** [[TMP127]], i8*** [[TMP131]], align 4 +// CHECK2-NEXT: 
[[TMP132:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 3 +// CHECK2-NEXT: store i8** [[TMP128]], i8*** [[TMP132]], align 4 +// CHECK2-NEXT: [[TMP133:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 4 +// CHECK2-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.3, i32 0, i32 0), i64** [[TMP133]], align 4 +// CHECK2-NEXT: [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 5 +// CHECK2-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i32 0, i32 0), i64** [[TMP134]], align 4 +// CHECK2-NEXT: [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 6 +// CHECK2-NEXT: store i8** null, i8*** [[TMP135]], align 4 +// CHECK2-NEXT: [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 7 +// CHECK2-NEXT: store i8** null, i8*** [[TMP136]], align 4 +// CHECK2-NEXT: [[TMP137:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 8 +// CHECK2-NEXT: store i64 0, i64* [[TMP137]], align 8 +// CHECK2-NEXT: [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 9 +// CHECK2-NEXT: store i64 0, i64* [[TMP138]], align 8 +// CHECK2-NEXT: [[TMP139:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 10 +// CHECK2-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP139]], align 4 +// CHECK2-NEXT: [[TMP140:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 11 +// CHECK2-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP140]], align 4 +// CHECK2-NEXT: [[TMP141:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 12 +// CHECK2-NEXT: store i32 0, i32* [[TMP141]], align 4 +// CHECK2-NEXT: [[TMP142:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]]) +// CHECK2-NEXT: [[TMP143:%.*]] = icmp ne i32 [[TMP142]], 0 +// CHECK2-NEXT: br i1 [[TMP143]], label [[OMP_OFFLOAD_FAILED12:%.*]], label [[OMP_OFFLOAD_CONT13:%.*]] +// CHECK2: omp_offload.failed12: +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111(double* [[TMP116]], %struct.TT.0* [[E]]) #[[ATTR3]] +// CHECK2-NEXT: br label [[OMP_OFFLOAD_CONT13]] +// CHECK2: omp_offload.cont13: +// CHECK2-NEXT: [[TMP144:%.*]] = load i32, i32* [[A]], align 4 +// CHECK2-NEXT: [[TMP145:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// CHECK2-NEXT: call void @llvm.stackrestore(i8* [[TMP145]]) +// CHECK2-NEXT: ret i32 [[TMP144]] +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63 +// CHECK2-SAME: (i32 noundef [[A:%.*]], i32* noundef [[P:%.*]], i32 noundef [[GA:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[P_ADDR:%.*]] = alloca 
i32*, align 4 +// CHECK2-NEXT: [[GA_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK2-NEXT: store i32* [[P]], i32** [[P_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[GA]], i32* [[GA_ADDR]], align 4 +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70 +// CHECK2-SAME: (i32 noundef [[AA:%.*]], [10 x float]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i32 noundef [[VLA:%.*]], float* noundef nonnull align 4 dereferenceable(4) [[BN:%.*]], [5 x [10 x double]]* noundef nonnull align 4 dereferenceable(400) [[C:%.*]], i32 noundef [[VLA1:%.*]], i32 noundef [[VLA3:%.*]], double* noundef nonnull align 4 dereferenceable(8) [[CN:%.*]], %struct.TT* noundef nonnull align 4 dereferenceable(12) [[D:%.*]]) #[[ATTR2]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x float]*, align 4 +// CHECK2-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[BN_ADDR:%.*]] = alloca float*, align 4 +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [5 x [10 x double]]*, align 4 +// CHECK2-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[VLA_ADDR4:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[CN_ADDR:%.*]] = alloca double*, align 4 +// CHECK2-NEXT: [[D_ADDR:%.*]] = alloca %struct.TT*, align 4 +// CHECK2-NEXT: [[B5:%.*]] = alloca [10 x float], align 4 +// CHECK2-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[C7:%.*]] = alloca [5 x [10 x double]], align 8 +// CHECK2-NEXT: [[__VLA_EXPR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[__VLA_EXPR2:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[D9:%.*]] = alloca [[STRUCT_TT:%.*]], align 4 +// CHECK2-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 +// CHECK2-NEXT: store [10 x float]* [[B]], [10 x float]** [[B_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[VLA]], i32* [[VLA_ADDR]], align 4 +// CHECK2-NEXT: store float* [[BN]], float** [[BN_ADDR]], align 4 +// CHECK2-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[C_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[VLA1]], i32* [[VLA_ADDR2]], align 4 +// CHECK2-NEXT: store i32 [[VLA3]], i32* [[VLA_ADDR4]], align 4 +// CHECK2-NEXT: store double* [[CN]], double** [[CN_ADDR]], align 4 +// CHECK2-NEXT: store %struct.TT* [[D]], %struct.TT** [[D_ADDR]], align 4 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x float]*, [10 x float]** [[B_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load float*, float** [[BN_ADDR]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[VLA_ADDR4]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = load double*, double** [[CN_ADDR]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load %struct.TT*, %struct.TT** [[D_ADDR]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast [10 x float]* [[B5]] to i8* +// CHECK2-NEXT: [[TMP9:%.*]] = bitcast [10 x float]* [[TMP0]] to i8* +// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i32 40, i1 false) +// CHECK2-NEXT: [[TMP10:%.*]] = call i8* @llvm.stacksave() +// CHECK2-NEXT: store i8* [[TMP10]], i8** [[SAVED_STACK]], align 4 +// 
CHECK2-NEXT: [[VLA6:%.*]] = alloca float, i32 [[TMP1]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[__VLA_EXPR0]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = mul nuw i32 [[TMP1]], 4 +// CHECK2-NEXT: [[TMP12:%.*]] = bitcast float* [[VLA6]] to i8* +// CHECK2-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP2]] to i8* +// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP12]], i8* align 4 [[TMP13]], i32 [[TMP11]], i1 false) +// CHECK2-NEXT: [[TMP14:%.*]] = bitcast [5 x [10 x double]]* [[C7]] to i8* +// CHECK2-NEXT: [[TMP15:%.*]] = bitcast [5 x [10 x double]]* [[TMP3]] to i8* +// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i32 400, i1 false) +// CHECK2-NEXT: [[TMP16:%.*]] = mul nuw i32 [[TMP4]], [[TMP5]] +// CHECK2-NEXT: [[VLA8:%.*]] = alloca double, i32 [[TMP16]], align 8 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[__VLA_EXPR1]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[__VLA_EXPR2]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = mul nuw i32 [[TMP4]], [[TMP5]] +// CHECK2-NEXT: [[TMP18:%.*]] = mul nuw i32 [[TMP17]], 8 +// CHECK2-NEXT: [[TMP19:%.*]] = bitcast double* [[VLA8]] to i8* +// CHECK2-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP6]] to i8* +// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP19]], i8* align 8 [[TMP20]], i32 [[TMP18]], i1 false) +// CHECK2-NEXT: [[TMP21:%.*]] = bitcast %struct.TT* [[D9]] to i8* +// CHECK2-NEXT: [[TMP22:%.*]] = bitcast %struct.TT* [[TMP7]] to i8* +// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP21]], i8* align 4 [[TMP22]], i32 12, i1 false) +// CHECK2-NEXT: [[TMP23:%.*]] = load i16, i16* [[CONV]], align 2 +// CHECK2-NEXT: [[CONV10:%.*]] = sext i16 [[TMP23]] to i32 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV10]], 1 +// CHECK2-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD]] to i16 +// CHECK2-NEXT: store i16 [[CONV11]], i16* [[CONV]], align 2 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[B5]], i32 0, i32 2 +// CHECK2-NEXT: store float 1.000000e+00, float* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[VLA6]], i32 3 +// CHECK2-NEXT: store float 1.000000e+00, float* [[ARRAYIDX12]], align 4 +// CHECK2-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[C7]], i32 0, i32 1 +// CHECK2-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX13]], i32 0, i32 2 +// CHECK2-NEXT: store double 1.000000e+00, double* [[ARRAYIDX14]], align 8 +// CHECK2-NEXT: [[TMP24:%.*]] = mul nsw i32 1, [[TMP5]] +// CHECK2-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds double, double* [[VLA8]], i32 [[TMP24]] +// CHECK2-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX15]], i32 3 +// CHECK2-NEXT: store double 1.000000e+00, double* [[ARRAYIDX16]], align 8 +// CHECK2-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 0 +// CHECK2-NEXT: store i64 1, i64* [[X]], align 4 +// CHECK2-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 1 +// CHECK2-NEXT: store i8 1, i8* [[Y]], align 4 +// CHECK2-NEXT: [[TMP25:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// CHECK2-NEXT: call void @llvm.stackrestore(i8* [[TMP25]]) +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111 +// CHECK2-SAME: (double* noundef [[PTR:%.*]], %struct.TT.0* noundef nonnull align 
4 dereferenceable(8) [[E:%.*]]) #[[ATTR2]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 4 +// CHECK2-NEXT: [[E_ADDR:%.*]] = alloca %struct.TT.0*, align 4 +// CHECK2-NEXT: [[E1:%.*]] = alloca [[STRUCT_TT_0:%.*]], align 4 +// CHECK2-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 4 +// CHECK2-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[E_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load %struct.TT.0*, %struct.TT.0** [[E_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = bitcast %struct.TT.0* [[E1]] to i8* +// CHECK2-NEXT: [[TMP2:%.*]] = bitcast %struct.TT.0* [[TMP0]] to i8* +// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i32 8, i1 false) +// CHECK2-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E1]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[X]], align 4 +// CHECK2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP3]] to double +// CHECK2-NEXT: [[TMP4:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[TMP4]], i32 0 +// CHECK2-NEXT: store double [[CONV]], double* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// CHECK2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[TMP5]], i32 0 +// CHECK2-NEXT: [[TMP6:%.*]] = load double, double* [[ARRAYIDX2]], align 4 +// CHECK2-NEXT: [[INC:%.*]] = fadd double [[TMP6]], 1.000000e+00 +// CHECK2-NEXT: store double [[INC]], double* [[ARRAYIDX2]], align 4 +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@_Z3bariPd +// CHECK2-SAME: (i32 noundef [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 4 +// CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[S:%.*]] = alloca [[STRUCT_S1:%.*]], align 4 +// CHECK2-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// CHECK2-NEXT: [[CALL:%.*]] = call noundef i32 @_Z3fooiPd(i32 noundef [[TMP0]], double* noundef [[TMP1]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[A]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[CALL]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: [[CALL1:%.*]] = call noundef i32 @_ZN2S12r1Ei(%struct.S1* noundef nonnull align 4 dereferenceable(8) [[S]], i32 noundef [[TMP3]]) +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[A]], align 4 +// CHECK2-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], [[CALL1]] +// CHECK2-NEXT: store i32 [[ADD2]], i32* [[A]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: [[CALL3:%.*]] = call noundef i32 @_ZL7fstatici(i32 noundef [[TMP5]]) +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4 +// CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP6]], [[CALL3]] +// CHECK2-NEXT: store i32 [[ADD4]], i32* [[A]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: [[CALL5:%.*]] = call noundef i32 @_Z9ftemplateIiET_i(i32 noundef [[TMP7]]) +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, 
i32* [[A]], align 4 +// CHECK2-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP8]], [[CALL5]] +// CHECK2-NEXT: store i32 [[ADD6]], i32* [[A]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[A]], align 4 +// CHECK2-NEXT: ret i32 [[TMP9]] +// +// +// CHECK2-LABEL: define {{[^@]+}}@_ZN2S12r1Ei +// CHECK2-SAME: (%struct.S1* noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0]] comdat align 2 { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 4 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[B:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[B_CASTED:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK2-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK2-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [5 x i8*], align 4 +// CHECK2-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [5 x i64], align 4 +// CHECK2-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK2-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: [[THIS1:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// CHECK2-NEXT: store i32 [[ADD]], i32* [[B]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = call i8* @llvm.stacksave() +// CHECK2-NEXT: store i8* [[TMP2]], i8** [[SAVED_STACK]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = mul nuw i32 2, [[TMP1]] +// CHECK2-NEXT: [[VLA:%.*]] = alloca i16, i32 [[TMP3]], align 2 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[__VLA_EXPR0]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[B]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[B_CASTED]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[B_CASTED]], align 4 +// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[THIS1]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP6:%.*]] = mul nuw i32 2, [[TMP1]] +// CHECK2-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 2 +// CHECK2-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64 +// CHECK2-NEXT: [[TMP9:%.*]] = bitcast [5 x i64]* [[DOTOFFLOAD_SIZES]] to i8* +// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP9]], i8* align 4 bitcast ([5 x i64]* @.offload_sizes.5 to i8*), i32 40, i1 false) +// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP11:%.*]] = bitcast i8** [[TMP10]] to %struct.S1** +// CHECK2-NEXT: store %struct.S1* [[THIS1]], %struct.S1** [[TMP11]], align 4 +// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP13:%.*]] = bitcast i8** [[TMP12]] to double** +// CHECK2-NEXT: store double* [[A]], double** [[TMP13]], align 4 +// CHECK2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CHECK2-NEXT: store i8* null, i8** [[TMP14]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP16:%.*]] = bitcast i8** [[TMP15]] to i32* +// CHECK2-NEXT: store i32 
[[TMP5]], i32* [[TMP16]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP18:%.*]] = bitcast i8** [[TMP17]] to i32* +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[TMP18]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1 +// CHECK2-NEXT: store i8* null, i8** [[TMP19]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK2-NEXT: [[TMP21:%.*]] = bitcast i8** [[TMP20]] to i32* +// CHECK2-NEXT: store i32 2, i32* [[TMP21]], align 4 +// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK2-NEXT: [[TMP23:%.*]] = bitcast i8** [[TMP22]] to i32* +// CHECK2-NEXT: store i32 2, i32* [[TMP23]], align 4 +// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2 +// CHECK2-NEXT: store i8* null, i8** [[TMP24]], align 4 +// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 3 +// CHECK2-NEXT: [[TMP26:%.*]] = bitcast i8** [[TMP25]] to i32* +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[TMP26]], align 4 +// CHECK2-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 3 +// CHECK2-NEXT: [[TMP28:%.*]] = bitcast i8** [[TMP27]] to i32* +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[TMP28]], align 4 +// CHECK2-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 3 +// CHECK2-NEXT: store i8* null, i8** [[TMP29]], align 4 +// CHECK2-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 4 +// CHECK2-NEXT: [[TMP31:%.*]] = bitcast i8** [[TMP30]] to i16** +// CHECK2-NEXT: store i16* [[VLA]], i16** [[TMP31]], align 4 +// CHECK2-NEXT: [[TMP32:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 4 +// CHECK2-NEXT: [[TMP33:%.*]] = bitcast i8** [[TMP32]] to i16** +// CHECK2-NEXT: store i16* [[VLA]], i16** [[TMP33]], align 4 +// CHECK2-NEXT: [[TMP34:%.*]] = getelementptr inbounds [5 x i64], [5 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 4 +// CHECK2-NEXT: store i64 [[TMP8]], i64* [[TMP34]], align 4 +// CHECK2-NEXT: [[TMP35:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 4 +// CHECK2-NEXT: store i8* null, i8** [[TMP35]], align 4 +// CHECK2-NEXT: [[TMP36:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP37:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP38:%.*]] = getelementptr inbounds [5 x i64], [5 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK2-NEXT: store i32 2, i32* [[TMP39]], align 4 +// CHECK2-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK2-NEXT: store i32 5, i32* [[TMP40]], align 4 +// CHECK2-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK2-NEXT: store i8** [[TMP36]], i8*** [[TMP41]], align 4 +// CHECK2-NEXT: 
[[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK2-NEXT: store i8** [[TMP37]], i8*** [[TMP42]], align 4 +// CHECK2-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK2-NEXT: store i64* [[TMP38]], i64** [[TMP43]], align 4 +// CHECK2-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK2-NEXT: store i64* getelementptr inbounds ([5 x i64], [5 x i64]* @.offload_maptypes.6, i32 0, i32 0), i64** [[TMP44]], align 4 +// CHECK2-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK2-NEXT: store i8** null, i8*** [[TMP45]], align 4 +// CHECK2-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK2-NEXT: store i8** null, i8*** [[TMP46]], align 4 +// CHECK2-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK2-NEXT: store i64 0, i64* [[TMP47]], align 8 +// CHECK2-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK2-NEXT: store i64 0, i64* [[TMP48]], align 8 +// CHECK2-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK2-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP49]], align 4 +// CHECK2-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK2-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP50]], align 4 +// CHECK2-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK2-NEXT: store i32 0, i32* [[TMP51]], align 4 +// CHECK2-NEXT: [[TMP52:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK2-NEXT: [[TMP53:%.*]] = icmp ne i32 [[TMP52]], 0 +// CHECK2-NEXT: br i1 [[TMP53]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK2: omp_offload.failed: +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167(%struct.S1* [[THIS1]], i32 [[TMP5]], i32 2, i32 [[TMP1]], i16* [[VLA]]) #[[ATTR3]] +// CHECK2-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK2: omp_offload.cont: +// CHECK2-NEXT: [[TMP54:%.*]] = mul nsw i32 1, [[TMP1]] +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA]], i32 [[TMP54]] +// CHECK2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i32 1 +// CHECK2-NEXT: [[TMP55:%.*]] = load i16, i16* [[ARRAYIDX2]], align 2 +// CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP55]] to i32 +// CHECK2-NEXT: [[TMP56:%.*]] = load i32, i32* [[B]], align 4 +// CHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 [[CONV]], [[TMP56]] +// CHECK2-NEXT: [[TMP57:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// CHECK2-NEXT: call void @llvm.stackrestore(i8* [[TMP57]]) +// CHECK2-NEXT: ret i32 
[[ADD3]] +// +// +// CHECK2-LABEL: define {{[^@]+}}@_ZL7fstatici +// CHECK2-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[AAA:%.*]] = alloca i8, align 1 +// CHECK2-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// CHECK2-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[AAA_CASTED:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK2-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK2-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x i8*], align 4 +// CHECK2-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK2-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK2-NEXT: store i8 0, i8* [[AAA]], align 1 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// CHECK2-NEXT: store i32 [[TMP0]], i32* [[A_CASTED]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[A_CASTED]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i8, i8* [[AAA]], align 1 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[AAA_CASTED]] to i8* +// CHECK2-NEXT: store i8 [[TMP2]], i8* [[CONV]], align 1 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[AAA_CASTED]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8** [[TMP4]] to i32* +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[TMP5]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32* +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[TMP7]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CHECK2-NEXT: store i8* null, i8** [[TMP8]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8** [[TMP9]] to i32* +// CHECK2-NEXT: store i32 [[TMP3]], i32* [[TMP10]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i8** [[TMP11]] to i32* +// CHECK2-NEXT: store i32 [[TMP3]], i32* [[TMP12]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1 +// CHECK2-NEXT: store i8* null, i8** [[TMP13]], align 4 +// CHECK2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK2-NEXT: [[TMP15:%.*]] = bitcast i8** [[TMP14]] to [10 x i32]** +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP15]], align 4 +// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK2-NEXT: [[TMP17:%.*]] = bitcast i8** [[TMP16]] to [10 x i32]** +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP17]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2 +// CHECK2-NEXT: store i8* null, i8** [[TMP18]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [3 x i8*], [3 x 
i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK2-NEXT: store i32 2, i32* [[TMP21]], align 4 +// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK2-NEXT: store i32 3, i32* [[TMP22]], align 4 +// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK2-NEXT: store i8** [[TMP19]], i8*** [[TMP23]], align 4 +// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK2-NEXT: store i8** [[TMP20]], i8*** [[TMP24]], align 4 +// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK2-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_sizes.7, i32 0, i32 0), i64** [[TMP25]], align 4 +// CHECK2-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK2-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_maptypes.8, i32 0, i32 0), i64** [[TMP26]], align 4 +// CHECK2-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK2-NEXT: store i8** null, i8*** [[TMP27]], align 4 +// CHECK2-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK2-NEXT: store i8** null, i8*** [[TMP28]], align 4 +// CHECK2-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK2-NEXT: store i64 0, i64* [[TMP29]], align 8 +// CHECK2-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK2-NEXT: store i64 0, i64* [[TMP30]], align 8 +// CHECK2-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK2-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP31]], align 4 +// CHECK2-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK2-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP32]], align 4 +// CHECK2-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK2-NEXT: store i32 0, i32* [[TMP33]], align 4 +// CHECK2-NEXT: [[TMP34:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK2-NEXT: [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0 +// CHECK2-NEXT: br i1 [[TMP35]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK2: omp_offload.failed: +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142(i32 [[TMP1]], i32 [[TMP3]], [10 x i32]* [[B]]) 
#[[ATTR3]] +// CHECK2-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK2: omp_offload.cont: +// CHECK2-NEXT: [[TMP36:%.*]] = load i32, i32* [[A]], align 4 +// CHECK2-NEXT: ret i32 [[TMP36]] +// +// +// CHECK2-LABEL: define {{[^@]+}}@_Z9ftemplateIiET_i +// CHECK2-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0]] comdat { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// CHECK2-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [2 x i8*], align 4 +// CHECK2-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [2 x i8*], align 4 +// CHECK2-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [2 x i8*], align 4 +// CHECK2-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK2-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// CHECK2-NEXT: store i32 [[TMP0]], i32* [[A_CASTED]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[A_CASTED]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i32* +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[TMP3]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8** [[TMP4]] to i32* +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[TMP5]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CHECK2-NEXT: store i8* null, i8** [[TMP6]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i8** [[TMP7]] to [10 x i32]** +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP8]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8** [[TMP9]] to [10 x i32]** +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP10]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1 +// CHECK2-NEXT: store i8* null, i8** [[TMP11]], align 4 +// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK2-NEXT: store i32 2, i32* [[TMP14]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK2-NEXT: store i32 2, i32* [[TMP15]], align 4 +// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK2-NEXT: store i8** [[TMP12]], i8*** [[TMP16]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK2-NEXT: store i8** [[TMP13]], i8*** [[TMP17]], 
align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK2-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.9, i32 0, i32 0), i64** [[TMP18]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK2-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.10, i32 0, i32 0), i64** [[TMP19]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK2-NEXT: store i8** null, i8*** [[TMP20]], align 4 +// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK2-NEXT: store i8** null, i8*** [[TMP21]], align 4 +// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK2-NEXT: store i64 0, i64* [[TMP22]], align 8 +// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK2-NEXT: store i64 0, i64* [[TMP23]], align 8 +// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK2-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP24]], align 4 +// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK2-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP25]], align 4 +// CHECK2-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK2-NEXT: store i32 0, i32* [[TMP26]], align 4 +// CHECK2-NEXT: [[TMP27:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK2-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0 +// CHECK2-NEXT: br i1 [[TMP28]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK2: omp_offload.failed: +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128(i32 [[TMP1]], [10 x i32]* [[B]]) #[[ATTR3]] +// CHECK2-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK2: omp_offload.cont: +// CHECK2-NEXT: [[TMP29:%.*]] = load i32, i32* [[A]], align 4 +// CHECK2-NEXT: ret i32 [[TMP29]] +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167 +// CHECK2-SAME: (%struct.S1* noundef [[THIS:%.*]], i32 noundef [[B:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR2]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 4 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca i16*, align 4 +// CHECK2-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: 
[[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[__VLA_EXPR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[VLA]], i32* [[VLA_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[VLA1]], i32* [[VLA_ADDR2]], align 4 +// CHECK2-NEXT: store i16* [[C]], i16** [[C_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i16*, i16** [[C_ADDR]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = call i8* @llvm.stacksave() +// CHECK2-NEXT: store i8* [[TMP4]], i8** [[SAVED_STACK]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = mul nuw i32 [[TMP1]], [[TMP2]] +// CHECK2-NEXT: [[VLA3:%.*]] = alloca i16, i32 [[TMP5]], align 2 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[__VLA_EXPR0]], align 4 +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[__VLA_EXPR1]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = mul nuw i32 [[TMP1]], [[TMP2]] +// CHECK2-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 2 +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i16* [[VLA3]] to i8* +// CHECK2-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP3]] to i8* +// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[TMP8]], i8* align 2 [[TMP9]], i32 [[TMP7]], i1 false) +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[B_ADDR]], align 4 +// CHECK2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP10]] to double +// CHECK2-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00 +// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[TMP0]], i32 0, i32 0 +// CHECK2-NEXT: store double [[ADD]], double* [[A]], align 4 +// CHECK2-NEXT: [[A4:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP11:%.*]] = load double, double* [[A4]], align 4 +// CHECK2-NEXT: [[INC:%.*]] = fadd double [[TMP11]], 1.000000e+00 +// CHECK2-NEXT: store double [[INC]], double* [[A4]], align 4 +// CHECK2-NEXT: [[CONV5:%.*]] = fptosi double [[INC]] to i16 +// CHECK2-NEXT: [[TMP12:%.*]] = mul nsw i32 1, [[TMP2]] +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA3]], i32 [[TMP12]] +// CHECK2-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i32 1 +// CHECK2-NEXT: store i16 [[CONV5]], i16* [[ARRAYIDX6]], align 2 +// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// CHECK2-NEXT: call void @llvm.stackrestore(i8* [[TMP13]]) +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142 +// CHECK2-SAME: (i32 noundef [[A:%.*]], i32 noundef [[AAA:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR2]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[AAA_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK2-NEXT: [[B1:%.*]] = alloca [10 x i32], align 4 +// CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[AAA]], i32* [[AAA_ADDR]], align 4 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[AAA_ADDR]] to i8* +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = 
bitcast [10 x i32]* [[B1]] to i8* +// CHECK2-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i32 40, i1 false) +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK2-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i8, i8* [[CONV]], align 1 +// CHECK2-NEXT: [[CONV2:%.*]] = sext i8 [[TMP4]] to i32 +// CHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 [[CONV2]], 1 +// CHECK2-NEXT: [[CONV4:%.*]] = trunc i32 [[ADD3]] to i8 +// CHECK2-NEXT: store i8 [[CONV4]], i8* [[CONV]], align 1 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B1]], i32 0, i32 2 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK2-NEXT: store i32 [[ADD5]], i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128 +// CHECK2-SAME: (i32 noundef [[A:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR2]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK2-NEXT: [[B1:%.*]] = alloca [10 x i32], align 4 +// CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B1]] to i8* +// CHECK2-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i32 40, i1 false) +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK2-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B1]], i32 0, i32 2 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK2-NEXT: store i32 [[ADD2]], i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// CHECK2-SAME: () #[[ATTR5:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: call void @__tgt_register_requires(i64 1) +// CHECK2-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@_Z3fooiPd +// CHECK3-SAME: (i32 noundef [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 4 +// CHECK3-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[AA:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[B:%.*]] = alloca [10 x float], align 4 +// CHECK3-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[C:%.*]] = alloca [5 x [10 x double]], align 8 +// CHECK3-NEXT: [[__VLA_EXPR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[D:%.*]] = alloca [[STRUCT_TT:%.*]], align 4 +// CHECK3-NEXT: [[E:%.*]] = alloca [[STRUCT_TT_0:%.*]], align 4 +// CHECK3-NEXT: [[P:%.*]] = alloca i32*, align 64 +// CHECK3-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4 +// 
CHECK3-NEXT: [[GA_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x i8*], align 4 +// CHECK3-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK3-NEXT: [[AA_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_BASEPTRS2:%.*]] = alloca [9 x i8*], align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_PTRS3:%.*]] = alloca [9 x i8*], align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS4:%.*]] = alloca [9 x i8*], align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [9 x i64], align 4 +// CHECK3-NEXT: [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK3-NEXT: [[DOTOFFLOAD_BASEPTRS8:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_PTRS9:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS10:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-NEXT: [[KERNEL_ARGS11:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK3-NEXT: store i16 0, i16* [[AA]], align 2 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = call i8* @llvm.stacksave() +// CHECK3-NEXT: store i8* [[TMP1]], i8** [[SAVED_STACK]], align 4 +// CHECK3-NEXT: [[VLA:%.*]] = alloca float, i32 [[TMP0]], align 4 +// CHECK3-NEXT: store i32 [[TMP0]], i32* [[__VLA_EXPR0]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = mul nuw i32 5, [[TMP2]] +// CHECK3-NEXT: [[VLA1:%.*]] = alloca double, i32 [[TMP3]], align 8 +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[__VLA_EXPR1]], align 4 +// CHECK3-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[X]], align 4 +// CHECK3-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[Y]], align 4 +// CHECK3-NEXT: store i32* [[A]], i32** [[P]], align 64 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4 +// CHECK3-NEXT: store i32 [[TMP6]], i32* [[A_CASTED]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[A_CASTED]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32*, i32** [[P]], align 64 +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* @ga, align 4 +// CHECK3-NEXT: store i32 [[TMP9]], i32* [[GA_CASTED]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[GA_CASTED]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i8** [[TMP11]] to i32* +// CHECK3-NEXT: store i32 [[TMP7]], i32* [[TMP12]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i8** [[TMP13]] to i32* +// CHECK3-NEXT: store i32 [[TMP7]], i32* [[TMP14]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CHECK3-NEXT: store i8* null, i8** [[TMP15]], align 4 +// CHECK3-NEXT: 
[[TMP16:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP17:%.*]] = bitcast i8** [[TMP16]] to i32** +// CHECK3-NEXT: store i32* [[TMP8]], i32** [[TMP17]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP19:%.*]] = bitcast i8** [[TMP18]] to i32** +// CHECK3-NEXT: store i32* [[TMP8]], i32** [[TMP19]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1 +// CHECK3-NEXT: store i8* null, i8** [[TMP20]], align 4 +// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP22:%.*]] = bitcast i8** [[TMP21]] to i32* +// CHECK3-NEXT: store i32 [[TMP10]], i32* [[TMP22]], align 4 +// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP24:%.*]] = bitcast i8** [[TMP23]] to i32* +// CHECK3-NEXT: store i32 [[TMP10]], i32* [[TMP24]], align 4 +// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2 +// CHECK3-NEXT: store i8* null, i8** [[TMP25]], align 4 +// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK3-NEXT: store i32 2, i32* [[TMP28]], align 4 +// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK3-NEXT: store i32 3, i32* [[TMP29]], align 4 +// CHECK3-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK3-NEXT: store i8** [[TMP26]], i8*** [[TMP30]], align 4 +// CHECK3-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK3-NEXT: store i8** [[TMP27]], i8*** [[TMP31]], align 4 +// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK3-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_sizes, i32 0, i32 0), i64** [[TMP32]], align 4 +// CHECK3-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK3-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_maptypes, i32 0, i32 0), i64** [[TMP33]], align 4 +// CHECK3-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK3-NEXT: store i8** null, i8*** [[TMP34]], align 4 +// CHECK3-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK3-NEXT: store i8** null, i8*** [[TMP35]], align 4 +// CHECK3-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK3-NEXT: store i64 0, i64* 
[[TMP36]], align 8 +// CHECK3-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK3-NEXT: store i64 0, i64* [[TMP37]], align 8 +// CHECK3-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK3-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP38]], align 4 +// CHECK3-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK3-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP39]], align 4 +// CHECK3-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK3-NEXT: store i32 0, i32* [[TMP40]], align 4 +// CHECK3-NEXT: [[TMP41:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK3-NEXT: [[TMP42:%.*]] = icmp ne i32 [[TMP41]], 0 +// CHECK3-NEXT: br i1 [[TMP42]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK3: omp_offload.failed: +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63(i32 [[TMP7]], i32* [[TMP8]], i32 [[TMP10]]) #[[ATTR3:[0-9]+]] +// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK3: omp_offload.cont: +// CHECK3-NEXT: [[TMP43:%.*]] = load i16, i16* [[AA]], align 2 +// CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_CASTED]] to i16* +// CHECK3-NEXT: store i16 [[TMP43]], i16* [[CONV]], align 2 +// CHECK3-NEXT: [[TMP44:%.*]] = load i32, i32* [[AA_CASTED]], align 4 +// CHECK3-NEXT: [[TMP45:%.*]] = mul nuw i32 [[TMP0]], 4 +// CHECK3-NEXT: [[TMP46:%.*]] = sext i32 [[TMP45]] to i64 +// CHECK3-NEXT: [[TMP47:%.*]] = mul nuw i32 5, [[TMP2]] +// CHECK3-NEXT: [[TMP48:%.*]] = mul nuw i32 [[TMP47]], 8 +// CHECK3-NEXT: [[TMP49:%.*]] = sext i32 [[TMP48]] to i64 +// CHECK3-NEXT: [[TMP50:%.*]] = bitcast [9 x i64]* [[DOTOFFLOAD_SIZES]] to i8* +// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP50]], i8* align 4 bitcast ([9 x i64]* @.offload_sizes.1 to i8*), i32 72, i1 false) +// CHECK3-NEXT: [[TMP51:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP52:%.*]] = bitcast i8** [[TMP51]] to i32* +// CHECK3-NEXT: store i32 [[TMP44]], i32* [[TMP52]], align 4 +// CHECK3-NEXT: [[TMP53:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP54:%.*]] = bitcast i8** [[TMP53]] to i32* +// CHECK3-NEXT: store i32 [[TMP44]], i32* [[TMP54]], align 4 +// CHECK3-NEXT: [[TMP55:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 0 +// CHECK3-NEXT: store i8* null, i8** [[TMP55]], align 4 +// CHECK3-NEXT: [[TMP56:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP57:%.*]] = bitcast i8** [[TMP56]] to [10 x float]** +// CHECK3-NEXT: store [10 x float]* [[B]], [10 x float]** [[TMP57]], align 4 +// CHECK3-NEXT: [[TMP58:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP59:%.*]] = bitcast i8** [[TMP58]] to [10 x float]** +// CHECK3-NEXT: store [10 x float]* [[B]], [10 x float]** [[TMP59]], align 4 +// 
CHECK3-NEXT: [[TMP60:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 1 +// CHECK3-NEXT: store i8* null, i8** [[TMP60]], align 4 +// CHECK3-NEXT: [[TMP61:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP62:%.*]] = bitcast i8** [[TMP61]] to i32* +// CHECK3-NEXT: store i32 [[TMP0]], i32* [[TMP62]], align 4 +// CHECK3-NEXT: [[TMP63:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP64:%.*]] = bitcast i8** [[TMP63]] to i32* +// CHECK3-NEXT: store i32 [[TMP0]], i32* [[TMP64]], align 4 +// CHECK3-NEXT: [[TMP65:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 2 +// CHECK3-NEXT: store i8* null, i8** [[TMP65]], align 4 +// CHECK3-NEXT: [[TMP66:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 3 +// CHECK3-NEXT: [[TMP67:%.*]] = bitcast i8** [[TMP66]] to float** +// CHECK3-NEXT: store float* [[VLA]], float** [[TMP67]], align 4 +// CHECK3-NEXT: [[TMP68:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 3 +// CHECK3-NEXT: [[TMP69:%.*]] = bitcast i8** [[TMP68]] to float** +// CHECK3-NEXT: store float* [[VLA]], float** [[TMP69]], align 4 +// CHECK3-NEXT: [[TMP70:%.*]] = getelementptr inbounds [9 x i64], [9 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 3 +// CHECK3-NEXT: store i64 [[TMP46]], i64* [[TMP70]], align 4 +// CHECK3-NEXT: [[TMP71:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 3 +// CHECK3-NEXT: store i8* null, i8** [[TMP71]], align 4 +// CHECK3-NEXT: [[TMP72:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 4 +// CHECK3-NEXT: [[TMP73:%.*]] = bitcast i8** [[TMP72]] to [5 x [10 x double]]** +// CHECK3-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[TMP73]], align 4 +// CHECK3-NEXT: [[TMP74:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 4 +// CHECK3-NEXT: [[TMP75:%.*]] = bitcast i8** [[TMP74]] to [5 x [10 x double]]** +// CHECK3-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[TMP75]], align 4 +// CHECK3-NEXT: [[TMP76:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 4 +// CHECK3-NEXT: store i8* null, i8** [[TMP76]], align 4 +// CHECK3-NEXT: [[TMP77:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 5 +// CHECK3-NEXT: [[TMP78:%.*]] = bitcast i8** [[TMP77]] to i32* +// CHECK3-NEXT: store i32 5, i32* [[TMP78]], align 4 +// CHECK3-NEXT: [[TMP79:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 5 +// CHECK3-NEXT: [[TMP80:%.*]] = bitcast i8** [[TMP79]] to i32* +// CHECK3-NEXT: store i32 5, i32* [[TMP80]], align 4 +// CHECK3-NEXT: [[TMP81:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 5 +// CHECK3-NEXT: store i8* null, i8** [[TMP81]], align 4 +// CHECK3-NEXT: [[TMP82:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 6 +// CHECK3-NEXT: [[TMP83:%.*]] = bitcast i8** [[TMP82]] to i32* +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[TMP83]], align 4 +// CHECK3-NEXT: [[TMP84:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 6 +// CHECK3-NEXT: [[TMP85:%.*]] = bitcast i8** [[TMP84]] to i32* +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[TMP85]], align 4 +// CHECK3-NEXT: 
[[TMP86:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 6 +// CHECK3-NEXT: store i8* null, i8** [[TMP86]], align 4 +// CHECK3-NEXT: [[TMP87:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 7 +// CHECK3-NEXT: [[TMP88:%.*]] = bitcast i8** [[TMP87]] to double** +// CHECK3-NEXT: store double* [[VLA1]], double** [[TMP88]], align 4 +// CHECK3-NEXT: [[TMP89:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 7 +// CHECK3-NEXT: [[TMP90:%.*]] = bitcast i8** [[TMP89]] to double** +// CHECK3-NEXT: store double* [[VLA1]], double** [[TMP90]], align 4 +// CHECK3-NEXT: [[TMP91:%.*]] = getelementptr inbounds [9 x i64], [9 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 7 +// CHECK3-NEXT: store i64 [[TMP49]], i64* [[TMP91]], align 4 +// CHECK3-NEXT: [[TMP92:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 7 +// CHECK3-NEXT: store i8* null, i8** [[TMP92]], align 4 +// CHECK3-NEXT: [[TMP93:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 8 +// CHECK3-NEXT: [[TMP94:%.*]] = bitcast i8** [[TMP93]] to %struct.TT** +// CHECK3-NEXT: store %struct.TT* [[D]], %struct.TT** [[TMP94]], align 4 +// CHECK3-NEXT: [[TMP95:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 8 +// CHECK3-NEXT: [[TMP96:%.*]] = bitcast i8** [[TMP95]] to %struct.TT** +// CHECK3-NEXT: store %struct.TT* [[D]], %struct.TT** [[TMP96]], align 4 +// CHECK3-NEXT: [[TMP97:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 8 +// CHECK3-NEXT: store i8* null, i8** [[TMP97]], align 4 +// CHECK3-NEXT: [[TMP98:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP99:%.*]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[DOTOFFLOAD_PTRS3]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP100:%.*]] = getelementptr inbounds [9 x i64], [9 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 0 +// CHECK3-NEXT: store i32 2, i32* [[TMP101]], align 4 +// CHECK3-NEXT: [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 1 +// CHECK3-NEXT: store i32 9, i32* [[TMP102]], align 4 +// CHECK3-NEXT: [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 2 +// CHECK3-NEXT: store i8** [[TMP98]], i8*** [[TMP103]], align 4 +// CHECK3-NEXT: [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 3 +// CHECK3-NEXT: store i8** [[TMP99]], i8*** [[TMP104]], align 4 +// CHECK3-NEXT: [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 4 +// CHECK3-NEXT: store i64* [[TMP100]], i64** [[TMP105]], align 4 +// CHECK3-NEXT: [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 5 +// CHECK3-NEXT: store i64* getelementptr inbounds ([9 x i64], [9 x i64]* @.offload_maptypes.2, i32 0, i32 0), i64** [[TMP106]], align 4 +// CHECK3-NEXT: [[TMP107:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* 
[[KERNEL_ARGS5]], i32 0, i32 6 +// CHECK3-NEXT: store i8** null, i8*** [[TMP107]], align 4 +// CHECK3-NEXT: [[TMP108:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 7 +// CHECK3-NEXT: store i8** null, i8*** [[TMP108]], align 4 +// CHECK3-NEXT: [[TMP109:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 8 +// CHECK3-NEXT: store i64 0, i64* [[TMP109]], align 8 +// CHECK3-NEXT: [[TMP110:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 9 +// CHECK3-NEXT: store i64 0, i64* [[TMP110]], align 8 +// CHECK3-NEXT: [[TMP111:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 10 +// CHECK3-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP111]], align 4 +// CHECK3-NEXT: [[TMP112:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 11 +// CHECK3-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP112]], align 4 +// CHECK3-NEXT: [[TMP113:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]], i32 0, i32 12 +// CHECK3-NEXT: store i32 0, i32* [[TMP113]], align 4 +// CHECK3-NEXT: [[TMP114:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS5]]) +// CHECK3-NEXT: [[TMP115:%.*]] = icmp ne i32 [[TMP114]], 0 +// CHECK3-NEXT: br i1 [[TMP115]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] +// CHECK3: omp_offload.failed6: +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70(i32 [[TMP44]], [10 x float]* [[B]], i32 [[TMP0]], float* [[VLA]], [5 x [10 x double]]* [[C]], i32 5, i32 [[TMP2]], double* [[VLA1]], %struct.TT* [[D]]) #[[ATTR3]] +// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT7]] +// CHECK3: omp_offload.cont7: +// CHECK3-NEXT: [[TMP116:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP117:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP118:%.*]] = bitcast i8** [[TMP117]] to double** +// CHECK3-NEXT: store double* [[TMP116]], double** [[TMP118]], align 4 +// CHECK3-NEXT: [[TMP119:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS9]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP120:%.*]] = bitcast i8** [[TMP119]] to double** +// CHECK3-NEXT: store double* [[TMP116]], double** [[TMP120]], align 4 +// CHECK3-NEXT: [[TMP121:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS10]], i32 0, i32 0 +// CHECK3-NEXT: store i8* null, i8** [[TMP121]], align 4 +// CHECK3-NEXT: [[TMP122:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP123:%.*]] = bitcast i8** [[TMP122]] to %struct.TT.0** +// CHECK3-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[TMP123]], align 4 +// CHECK3-NEXT: [[TMP124:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS9]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP125:%.*]] = bitcast i8** [[TMP124]] to %struct.TT.0** +// CHECK3-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[TMP125]], align 4 +// CHECK3-NEXT: [[TMP126:%.*]] = getelementptr inbounds [2 x 
i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS10]], i32 0, i32 1 +// CHECK3-NEXT: store i8* null, i8** [[TMP126]], align 4 +// CHECK3-NEXT: [[TMP127:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP128:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS9]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP129:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 0 +// CHECK3-NEXT: store i32 2, i32* [[TMP129]], align 4 +// CHECK3-NEXT: [[TMP130:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 1 +// CHECK3-NEXT: store i32 2, i32* [[TMP130]], align 4 +// CHECK3-NEXT: [[TMP131:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 2 +// CHECK3-NEXT: store i8** [[TMP127]], i8*** [[TMP131]], align 4 +// CHECK3-NEXT: [[TMP132:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 3 +// CHECK3-NEXT: store i8** [[TMP128]], i8*** [[TMP132]], align 4 +// CHECK3-NEXT: [[TMP133:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 4 +// CHECK3-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.3, i32 0, i32 0), i64** [[TMP133]], align 4 +// CHECK3-NEXT: [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 5 +// CHECK3-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i32 0, i32 0), i64** [[TMP134]], align 4 +// CHECK3-NEXT: [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 6 +// CHECK3-NEXT: store i8** null, i8*** [[TMP135]], align 4 +// CHECK3-NEXT: [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 7 +// CHECK3-NEXT: store i8** null, i8*** [[TMP136]], align 4 +// CHECK3-NEXT: [[TMP137:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 8 +// CHECK3-NEXT: store i64 0, i64* [[TMP137]], align 8 +// CHECK3-NEXT: [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 9 +// CHECK3-NEXT: store i64 0, i64* [[TMP138]], align 8 +// CHECK3-NEXT: [[TMP139:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 10 +// CHECK3-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP139]], align 4 +// CHECK3-NEXT: [[TMP140:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 11 +// CHECK3-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP140]], align 4 +// CHECK3-NEXT: [[TMP141:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]], i32 0, i32 12 +// CHECK3-NEXT: store i32 0, i32* [[TMP141]], align 4 +// CHECK3-NEXT: [[TMP142:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111.region_id, 
%struct.__tgt_kernel_arguments* [[KERNEL_ARGS11]]) +// CHECK3-NEXT: [[TMP143:%.*]] = icmp ne i32 [[TMP142]], 0 +// CHECK3-NEXT: br i1 [[TMP143]], label [[OMP_OFFLOAD_FAILED12:%.*]], label [[OMP_OFFLOAD_CONT13:%.*]] +// CHECK3: omp_offload.failed12: +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111(double* [[TMP116]], %struct.TT.0* [[E]]) #[[ATTR3]] +// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT13]] +// CHECK3: omp_offload.cont13: +// CHECK3-NEXT: [[TMP144:%.*]] = load i32, i32* [[A]], align 4 +// CHECK3-NEXT: [[TMP145:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// CHECK3-NEXT: call void @llvm.stackrestore(i8* [[TMP145]]) +// CHECK3-NEXT: ret i32 [[TMP144]] +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63 +// CHECK3-SAME: (i32 noundef [[A:%.*]], i32* noundef [[P:%.*]], i32 noundef [[GA:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[P_ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[GA_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32* [[P]], i32** [[P_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[GA]], i32* [[GA_ADDR]], align 4 +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70 +// CHECK3-SAME: (i32 noundef [[AA:%.*]], [10 x float]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i32 noundef [[VLA:%.*]], float* noundef nonnull align 4 dereferenceable(4) [[BN:%.*]], [5 x [10 x double]]* noundef nonnull align 4 dereferenceable(400) [[C:%.*]], i32 noundef [[VLA1:%.*]], i32 noundef [[VLA3:%.*]], double* noundef nonnull align 4 dereferenceable(8) [[CN:%.*]], %struct.TT* noundef nonnull align 4 dereferenceable(12) [[D:%.*]]) #[[ATTR2]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x float]*, align 4 +// CHECK3-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[BN_ADDR:%.*]] = alloca float*, align 4 +// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [5 x [10 x double]]*, align 4 +// CHECK3-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[VLA_ADDR4:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[CN_ADDR:%.*]] = alloca double*, align 4 +// CHECK3-NEXT: [[D_ADDR:%.*]] = alloca %struct.TT*, align 4 +// CHECK3-NEXT: [[B5:%.*]] = alloca [10 x float], align 4 +// CHECK3-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[C7:%.*]] = alloca [5 x [10 x double]], align 8 +// CHECK3-NEXT: [[__VLA_EXPR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[__VLA_EXPR2:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[D9:%.*]] = alloca [[STRUCT_TT:%.*]], align 4 +// CHECK3-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 +// CHECK3-NEXT: store [10 x float]* [[B]], [10 x float]** [[B_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[VLA]], i32* [[VLA_ADDR]], align 4 +// CHECK3-NEXT: store float* [[BN]], float** [[BN_ADDR]], align 4 +// CHECK3-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[C_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[VLA1]], i32* [[VLA_ADDR2]], align 4 +// CHECK3-NEXT: store i32 [[VLA3]], i32* [[VLA_ADDR4]], align 4 +// CHECK3-NEXT: store double* [[CN]], double** [[CN_ADDR]], align 4 +// CHECK3-NEXT: store %struct.TT* [[D]], %struct.TT** [[D_ADDR]], align 4 +// CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] 
to i16* +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x float]*, [10 x float]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load float*, float** [[BN_ADDR]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[VLA_ADDR4]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load double*, double** [[CN_ADDR]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load %struct.TT*, %struct.TT** [[D_ADDR]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = bitcast [10 x float]* [[B5]] to i8* +// CHECK3-NEXT: [[TMP9:%.*]] = bitcast [10 x float]* [[TMP0]] to i8* +// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i32 40, i1 false) +// CHECK3-NEXT: [[TMP10:%.*]] = call i8* @llvm.stacksave() +// CHECK3-NEXT: store i8* [[TMP10]], i8** [[SAVED_STACK]], align 4 +// CHECK3-NEXT: [[VLA6:%.*]] = alloca float, i32 [[TMP1]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[__VLA_EXPR0]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = mul nuw i32 [[TMP1]], 4 +// CHECK3-NEXT: [[TMP12:%.*]] = bitcast float* [[VLA6]] to i8* +// CHECK3-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP2]] to i8* +// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP12]], i8* align 4 [[TMP13]], i32 [[TMP11]], i1 false) +// CHECK3-NEXT: [[TMP14:%.*]] = bitcast [5 x [10 x double]]* [[C7]] to i8* +// CHECK3-NEXT: [[TMP15:%.*]] = bitcast [5 x [10 x double]]* [[TMP3]] to i8* +// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i32 400, i1 false) +// CHECK3-NEXT: [[TMP16:%.*]] = mul nuw i32 [[TMP4]], [[TMP5]] +// CHECK3-NEXT: [[VLA8:%.*]] = alloca double, i32 [[TMP16]], align 8 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[__VLA_EXPR1]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[__VLA_EXPR2]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = mul nuw i32 [[TMP4]], [[TMP5]] +// CHECK3-NEXT: [[TMP18:%.*]] = mul nuw i32 [[TMP17]], 8 +// CHECK3-NEXT: [[TMP19:%.*]] = bitcast double* [[VLA8]] to i8* +// CHECK3-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP6]] to i8* +// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP19]], i8* align 8 [[TMP20]], i32 [[TMP18]], i1 false) +// CHECK3-NEXT: [[TMP21:%.*]] = bitcast %struct.TT* [[D9]] to i8* +// CHECK3-NEXT: [[TMP22:%.*]] = bitcast %struct.TT* [[TMP7]] to i8* +// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP21]], i8* align 4 [[TMP22]], i32 12, i1 false) +// CHECK3-NEXT: [[TMP23:%.*]] = load i16, i16* [[CONV]], align 2 +// CHECK3-NEXT: [[CONV10:%.*]] = sext i16 [[TMP23]] to i32 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV10]], 1 +// CHECK3-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD]] to i16 +// CHECK3-NEXT: store i16 [[CONV11]], i16* [[CONV]], align 2 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[B5]], i32 0, i32 2 +// CHECK3-NEXT: store float 1.000000e+00, float* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[VLA6]], i32 3 +// CHECK3-NEXT: store float 1.000000e+00, float* [[ARRAYIDX12]], align 4 +// CHECK3-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[C7]], i32 0, i32 1 +// CHECK3-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX13]], i32 0, i32 2 +// CHECK3-NEXT: store 
double 1.000000e+00, double* [[ARRAYIDX14]], align 8 +// CHECK3-NEXT: [[TMP24:%.*]] = mul nsw i32 1, [[TMP5]] +// CHECK3-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds double, double* [[VLA8]], i32 [[TMP24]] +// CHECK3-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX15]], i32 3 +// CHECK3-NEXT: store double 1.000000e+00, double* [[ARRAYIDX16]], align 8 +// CHECK3-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 0 +// CHECK3-NEXT: store i64 1, i64* [[X]], align 4 +// CHECK3-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 1 +// CHECK3-NEXT: store i8 1, i8* [[Y]], align 4 +// CHECK3-NEXT: [[TMP25:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// CHECK3-NEXT: call void @llvm.stackrestore(i8* [[TMP25]]) +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111 +// CHECK3-SAME: (double* noundef [[PTR:%.*]], %struct.TT.0* noundef nonnull align 4 dereferenceable(8) [[E:%.*]]) #[[ATTR2]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 4 +// CHECK3-NEXT: [[E_ADDR:%.*]] = alloca %struct.TT.0*, align 4 +// CHECK3-NEXT: [[E1:%.*]] = alloca [[STRUCT_TT_0:%.*]], align 4 +// CHECK3-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 4 +// CHECK3-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[E_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load %struct.TT.0*, %struct.TT.0** [[E_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = bitcast %struct.TT.0* [[E1]] to i8* +// CHECK3-NEXT: [[TMP2:%.*]] = bitcast %struct.TT.0* [[TMP0]] to i8* +// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i32 8, i1 false) +// CHECK3-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E1]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[X]], align 4 +// CHECK3-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP3]] to double +// CHECK3-NEXT: [[TMP4:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[TMP4]], i32 0 +// CHECK3-NEXT: store double [[CONV]], double* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// CHECK3-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[TMP5]], i32 0 +// CHECK3-NEXT: [[TMP6:%.*]] = load double, double* [[ARRAYIDX2]], align 4 +// CHECK3-NEXT: [[INC:%.*]] = fadd double [[TMP6]], 1.000000e+00 +// CHECK3-NEXT: store double [[INC]], double* [[ARRAYIDX2]], align 4 +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@_Z3bariPd +// CHECK3-SAME: (i32 noundef [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 4 +// CHECK3-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[S:%.*]] = alloca [[STRUCT_S1:%.*]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// CHECK3-NEXT: [[CALL:%.*]] = call noundef i32 @_Z3fooiPd(i32 noundef [[TMP0]], double* noundef [[TMP1]]) +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[A]], align 4 +// CHECK3-NEXT: 
[[ADD:%.*]] = add nsw i32 [[TMP2]], [[CALL]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: [[CALL1:%.*]] = call noundef i32 @_ZN2S12r1Ei(%struct.S1* noundef nonnull align 4 dereferenceable(8) [[S]], i32 noundef [[TMP3]]) +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[A]], align 4 +// CHECK3-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], [[CALL1]] +// CHECK3-NEXT: store i32 [[ADD2]], i32* [[A]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: [[CALL3:%.*]] = call noundef i32 @_ZL7fstatici(i32 noundef [[TMP5]]) +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4 +// CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP6]], [[CALL3]] +// CHECK3-NEXT: store i32 [[ADD4]], i32* [[A]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: [[CALL5:%.*]] = call noundef i32 @_Z9ftemplateIiET_i(i32 noundef [[TMP7]]) +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[A]], align 4 +// CHECK3-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP8]], [[CALL5]] +// CHECK3-NEXT: store i32 [[ADD6]], i32* [[A]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[A]], align 4 +// CHECK3-NEXT: ret i32 [[TMP9]] +// +// +// CHECK3-LABEL: define {{[^@]+}}@_ZN2S12r1Ei +// CHECK3-SAME: (%struct.S1* noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0]] comdat align 2 { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 4 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [5 x i8*], align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [5 x i64], align 4 +// CHECK3-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK3-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: [[THIS1:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// CHECK3-NEXT: store i32 [[ADD]], i32* [[B]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = call i8* @llvm.stacksave() +// CHECK3-NEXT: store i8* [[TMP2]], i8** [[SAVED_STACK]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = mul nuw i32 2, [[TMP1]] +// CHECK3-NEXT: [[VLA:%.*]] = alloca i16, i32 [[TMP3]], align 2 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[__VLA_EXPR0]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[B]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[B_CASTED]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[B_CASTED]], align 4 +// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[THIS1]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP6:%.*]] = mul nuw i32 2, [[TMP1]] +// CHECK3-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 2 +// CHECK3-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64 +// CHECK3-NEXT: [[TMP9:%.*]] = bitcast [5 x i64]* [[DOTOFFLOAD_SIZES]] to i8* +// 
CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP9]], i8* align 4 bitcast ([5 x i64]* @.offload_sizes.5 to i8*), i32 40, i1 false) +// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP11:%.*]] = bitcast i8** [[TMP10]] to %struct.S1** +// CHECK3-NEXT: store %struct.S1* [[THIS1]], %struct.S1** [[TMP11]], align 4 +// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP13:%.*]] = bitcast i8** [[TMP12]] to double** +// CHECK3-NEXT: store double* [[A]], double** [[TMP13]], align 4 +// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CHECK3-NEXT: store i8* null, i8** [[TMP14]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP16:%.*]] = bitcast i8** [[TMP15]] to i32* +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[TMP16]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP18:%.*]] = bitcast i8** [[TMP17]] to i32* +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[TMP18]], align 4 +// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1 +// CHECK3-NEXT: store i8* null, i8** [[TMP19]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP21:%.*]] = bitcast i8** [[TMP20]] to i32* +// CHECK3-NEXT: store i32 2, i32* [[TMP21]], align 4 +// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP23:%.*]] = bitcast i8** [[TMP22]] to i32* +// CHECK3-NEXT: store i32 2, i32* [[TMP23]], align 4 +// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2 +// CHECK3-NEXT: store i8* null, i8** [[TMP24]], align 4 +// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 3 +// CHECK3-NEXT: [[TMP26:%.*]] = bitcast i8** [[TMP25]] to i32* +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[TMP26]], align 4 +// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 3 +// CHECK3-NEXT: [[TMP28:%.*]] = bitcast i8** [[TMP27]] to i32* +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[TMP28]], align 4 +// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 3 +// CHECK3-NEXT: store i8* null, i8** [[TMP29]], align 4 +// CHECK3-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 4 +// CHECK3-NEXT: [[TMP31:%.*]] = bitcast i8** [[TMP30]] to i16** +// CHECK3-NEXT: store i16* [[VLA]], i16** [[TMP31]], align 4 +// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 4 +// CHECK3-NEXT: [[TMP33:%.*]] = bitcast i8** [[TMP32]] to i16** +// CHECK3-NEXT: store i16* [[VLA]], i16** [[TMP33]], align 4 +// CHECK3-NEXT: [[TMP34:%.*]] = getelementptr inbounds [5 x i64], [5 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 4 +// CHECK3-NEXT: store i64 [[TMP8]], i64* [[TMP34]], align 4 +// CHECK3-NEXT: [[TMP35:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 4 +// 
CHECK3-NEXT: store i8* null, i8** [[TMP35]], align 4 +// CHECK3-NEXT: [[TMP36:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP37:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP38:%.*]] = getelementptr inbounds [5 x i64], [5 x i64]* [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK3-NEXT: store i32 2, i32* [[TMP39]], align 4 +// CHECK3-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK3-NEXT: store i32 5, i32* [[TMP40]], align 4 +// CHECK3-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK3-NEXT: store i8** [[TMP36]], i8*** [[TMP41]], align 4 +// CHECK3-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK3-NEXT: store i8** [[TMP37]], i8*** [[TMP42]], align 4 +// CHECK3-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK3-NEXT: store i64* [[TMP38]], i64** [[TMP43]], align 4 +// CHECK3-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK3-NEXT: store i64* getelementptr inbounds ([5 x i64], [5 x i64]* @.offload_maptypes.6, i32 0, i32 0), i64** [[TMP44]], align 4 +// CHECK3-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK3-NEXT: store i8** null, i8*** [[TMP45]], align 4 +// CHECK3-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK3-NEXT: store i8** null, i8*** [[TMP46]], align 4 +// CHECK3-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK3-NEXT: store i64 0, i64* [[TMP47]], align 8 +// CHECK3-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK3-NEXT: store i64 0, i64* [[TMP48]], align 8 +// CHECK3-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK3-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP49]], align 4 +// CHECK3-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK3-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP50]], align 4 +// CHECK3-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK3-NEXT: store i32 0, i32* [[TMP51]], align 4 +// CHECK3-NEXT: [[TMP52:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK3-NEXT: [[TMP53:%.*]] = icmp ne i32 
[[TMP52]], 0 +// CHECK3-NEXT: br i1 [[TMP53]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK3: omp_offload.failed: +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167(%struct.S1* [[THIS1]], i32 [[TMP5]], i32 2, i32 [[TMP1]], i16* [[VLA]]) #[[ATTR3]] +// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK3: omp_offload.cont: +// CHECK3-NEXT: [[TMP54:%.*]] = mul nsw i32 1, [[TMP1]] +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA]], i32 [[TMP54]] +// CHECK3-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i32 1 +// CHECK3-NEXT: [[TMP55:%.*]] = load i16, i16* [[ARRAYIDX2]], align 2 +// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP55]] to i32 +// CHECK3-NEXT: [[TMP56:%.*]] = load i32, i32* [[B]], align 4 +// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[CONV]], [[TMP56]] +// CHECK3-NEXT: [[TMP57:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// CHECK3-NEXT: call void @llvm.stackrestore(i8* [[TMP57]]) +// CHECK3-NEXT: ret i32 [[ADD3]] +// +// +// CHECK3-LABEL: define {{[^@]+}}@_ZL7fstatici +// CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[AAA:%.*]] = alloca i8, align 1 +// CHECK3-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// CHECK3-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[AAA_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x i8*], align 4 +// CHECK3-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK3-NEXT: store i8 0, i8* [[AAA]], align 1 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// CHECK3-NEXT: store i32 [[TMP0]], i32* [[A_CASTED]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[A_CASTED]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i8, i8* [[AAA]], align 1 +// CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[AAA_CASTED]] to i8* +// CHECK3-NEXT: store i8 [[TMP2]], i8* [[CONV]], align 1 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[AAA_CASTED]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8** [[TMP4]] to i32* +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[TMP5]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32* +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[TMP7]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CHECK3-NEXT: store i8* null, i8** [[TMP8]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8** [[TMP9]] to i32* +// CHECK3-NEXT: store i32 [[TMP3]], i32* [[TMP10]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i8** [[TMP11]] to i32* +// CHECK3-NEXT: store i32 [[TMP3]], i32* [[TMP12]], align 4 +// 
CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1 +// CHECK3-NEXT: store i8* null, i8** [[TMP13]], align 4 +// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP15:%.*]] = bitcast i8** [[TMP14]] to [10 x i32]** +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP15]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP17:%.*]] = bitcast i8** [[TMP16]] to [10 x i32]** +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP17]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2 +// CHECK3-NEXT: store i8* null, i8** [[TMP18]], align 4 +// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK3-NEXT: store i32 2, i32* [[TMP21]], align 4 +// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK3-NEXT: store i32 3, i32* [[TMP22]], align 4 +// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK3-NEXT: store i8** [[TMP19]], i8*** [[TMP23]], align 4 +// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK3-NEXT: store i8** [[TMP20]], i8*** [[TMP24]], align 4 +// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK3-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_sizes.7, i32 0, i32 0), i64** [[TMP25]], align 4 +// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK3-NEXT: store i64* getelementptr inbounds ([3 x i64], [3 x i64]* @.offload_maptypes.8, i32 0, i32 0), i64** [[TMP26]], align 4 +// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK3-NEXT: store i8** null, i8*** [[TMP27]], align 4 +// CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK3-NEXT: store i8** null, i8*** [[TMP28]], align 4 +// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK3-NEXT: store i64 0, i64* [[TMP29]], align 8 +// CHECK3-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK3-NEXT: store i64 0, i64* [[TMP30]], align 8 +// CHECK3-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK3-NEXT: store [3 x i32] [i32 -1, i32 0, 
i32 0], [3 x i32]* [[TMP31]], align 4 +// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK3-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP32]], align 4 +// CHECK3-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK3-NEXT: store i32 0, i32* [[TMP33]], align 4 +// CHECK3-NEXT: [[TMP34:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK3-NEXT: [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0 +// CHECK3-NEXT: br i1 [[TMP35]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK3: omp_offload.failed: +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142(i32 [[TMP1]], i32 [[TMP3]], [10 x i32]* [[B]]) #[[ATTR3]] +// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK3: omp_offload.cont: +// CHECK3-NEXT: [[TMP36:%.*]] = load i32, i32* [[A]], align 4 +// CHECK3-NEXT: ret i32 [[TMP36]] +// +// +// CHECK3-LABEL: define {{[^@]+}}@_Z9ftemplateIiET_i +// CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0]] comdat { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// CHECK3-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[A]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// CHECK3-NEXT: store i32 [[TMP0]], i32* [[A_CASTED]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[A_CASTED]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i32* +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[TMP3]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8** [[TMP4]] to i32* +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[TMP5]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CHECK3-NEXT: store i8* null, i8** [[TMP6]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i8** [[TMP7]] to [10 x i32]** +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP8]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8** [[TMP9]] to [10 x i32]** +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[TMP10]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1 +// CHECK3-NEXT: store i8* null, i8** [[TMP11]], align 4 +// CHECK3-NEXT: 
[[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK3-NEXT: store i32 2, i32* [[TMP14]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK3-NEXT: store i32 2, i32* [[TMP15]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK3-NEXT: store i8** [[TMP12]], i8*** [[TMP16]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK3-NEXT: store i8** [[TMP13]], i8*** [[TMP17]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK3-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.9, i32 0, i32 0), i64** [[TMP18]], align 4 +// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK3-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.10, i32 0, i32 0), i64** [[TMP19]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK3-NEXT: store i8** null, i8*** [[TMP20]], align 4 +// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK3-NEXT: store i8** null, i8*** [[TMP21]], align 4 +// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK3-NEXT: store i64 0, i64* [[TMP22]], align 8 +// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK3-NEXT: store i64 0, i64* [[TMP23]], align 8 +// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK3-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], [3 x i32]* [[TMP24]], align 4 +// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK3-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP25]], align 4 +// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK3-NEXT: store i32 0, i32* [[TMP26]], align 4 +// CHECK3-NEXT: [[TMP27:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB1]], i64 -1, i32 -1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS]]) +// CHECK3-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0 +// CHECK3-NEXT: br i1 [[TMP28]], label [[OMP_OFFLOAD_FAILED:%.*]], label 
[[OMP_OFFLOAD_CONT:%.*]] +// CHECK3: omp_offload.failed: +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128(i32 [[TMP1]], [10 x i32]* [[B]]) #[[ATTR3]] +// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK3: omp_offload.cont: +// CHECK3-NEXT: [[TMP29:%.*]] = load i32, i32* [[A]], align 4 +// CHECK3-NEXT: ret i32 [[TMP29]] +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167 +// CHECK3-SAME: (%struct.S1* noundef [[THIS:%.*]], i32 noundef [[B:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR2]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca i16*, align 4 +// CHECK3-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[__VLA_EXPR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[VLA]], i32* [[VLA_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[VLA1]], i32* [[VLA_ADDR2]], align 4 +// CHECK3-NEXT: store i16* [[C]], i16** [[C_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i16*, i16** [[C_ADDR]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = call i8* @llvm.stacksave() +// CHECK3-NEXT: store i8* [[TMP4]], i8** [[SAVED_STACK]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = mul nuw i32 [[TMP1]], [[TMP2]] +// CHECK3-NEXT: [[VLA3:%.*]] = alloca i16, i32 [[TMP5]], align 2 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[__VLA_EXPR0]], align 4 +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[__VLA_EXPR1]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = mul nuw i32 [[TMP1]], [[TMP2]] +// CHECK3-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 2 +// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i16* [[VLA3]] to i8* +// CHECK3-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP3]] to i8* +// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[TMP8]], i8* align 2 [[TMP9]], i32 [[TMP7]], i1 false) +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[B_ADDR]], align 4 +// CHECK3-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP10]] to double +// CHECK3-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00 +// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[TMP0]], i32 0, i32 0 +// CHECK3-NEXT: store double [[ADD]], double* [[A]], align 4 +// CHECK3-NEXT: [[A4:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP11:%.*]] = load double, double* [[A4]], align 4 +// CHECK3-NEXT: [[INC:%.*]] = fadd double [[TMP11]], 1.000000e+00 +// CHECK3-NEXT: store double [[INC]], double* [[A4]], align 4 +// CHECK3-NEXT: [[CONV5:%.*]] = fptosi double [[INC]] to i16 +// CHECK3-NEXT: [[TMP12:%.*]] = mul nsw i32 1, [[TMP2]] +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA3]], i32 [[TMP12]] +// CHECK3-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i32 1 +// CHECK3-NEXT: store i16 [[CONV5]], i16* 
[[ARRAYIDX6]], align 2 +// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// CHECK3-NEXT: call void @llvm.stackrestore(i8* [[TMP13]]) +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142 +// CHECK3-SAME: (i32 noundef [[A:%.*]], i32 noundef [[AAA:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR2]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[AAA_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: [[B1:%.*]] = alloca [10 x i32], align 4 +// CHECK3-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[AAA]], i32* [[AAA_ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[AAA_ADDR]] to i8* +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B1]] to i8* +// CHECK3-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i32 40, i1 false) +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK3-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i8, i8* [[CONV]], align 1 +// CHECK3-NEXT: [[CONV2:%.*]] = sext i8 [[TMP4]] to i32 +// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[CONV2]], 1 +// CHECK3-NEXT: [[CONV4:%.*]] = trunc i32 [[ADD3]] to i8 +// CHECK3-NEXT: store i8 [[CONV4]], i8* [[CONV]], align 1 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B1]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK3-NEXT: store i32 [[ADD5]], i32* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128 +// CHECK3-SAME: (i32 noundef [[A:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR2]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: [[B1:%.*]] = alloca [10 x i32], align 4 +// CHECK3-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B1]] to i8* +// CHECK3-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i32 40, i1 false) +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK3-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B1]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK3-NEXT: store i32 [[ADD2]], i32* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// 
CHECK3-SAME: () #[[ATTR5:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) +// CHECK3-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_Z3fooiPd +// SIMD-ONLY0-SAME: (i32 noundef signext [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 8 +// SIMD-ONLY0-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[AA:%.*]] = alloca i16, align 2 +// SIMD-ONLY0-NEXT: [[B:%.*]] = alloca [10 x float], align 4 +// SIMD-ONLY0-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// SIMD-ONLY0-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// SIMD-ONLY0-NEXT: [[C:%.*]] = alloca [5 x [10 x double]], align 8 +// SIMD-ONLY0-NEXT: [[__VLA_EXPR1:%.*]] = alloca i64, align 8 +// SIMD-ONLY0-NEXT: [[D:%.*]] = alloca [[STRUCT_TT:%.*]], align 8 +// SIMD-ONLY0-NEXT: [[E:%.*]] = alloca [[STRUCT_TT_0:%.*]], align 4 +// SIMD-ONLY0-NEXT: [[P:%.*]] = alloca i32*, align 64 +// SIMD-ONLY0-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY0-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 8 +// SIMD-ONLY0-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY0-NEXT: store i16 0, i16* [[AA]], align 2 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +// SIMD-ONLY0-NEXT: [[TMP2:%.*]] = call i8* @llvm.stacksave() +// SIMD-ONLY0-NEXT: store i8* [[TMP2]], i8** [[SAVED_STACK]], align 8 +// SIMD-ONLY0-NEXT: [[VLA:%.*]] = alloca float, i64 [[TMP1]], align 4 +// SIMD-ONLY0-NEXT: store i64 [[TMP1]], i64* [[__VLA_EXPR0]], align 8 +// SIMD-ONLY0-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY0-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +// SIMD-ONLY0-NEXT: [[TMP5:%.*]] = mul nuw i64 5, [[TMP4]] +// SIMD-ONLY0-NEXT: [[VLA1:%.*]] = alloca double, i64 [[TMP5]], align 8 +// SIMD-ONLY0-NEXT: store i64 [[TMP4]], i64* [[__VLA_EXPR1]], align 8 +// SIMD-ONLY0-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 0 +// SIMD-ONLY0-NEXT: [[TMP6:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY0-NEXT: store i32 [[TMP6]], i32* [[X]], align 4 +// SIMD-ONLY0-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 1 +// SIMD-ONLY0-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY0-NEXT: store i32 [[TMP7]], i32* [[Y]], align 4 +// SIMD-ONLY0-NEXT: store i32* [[A]], i32** [[P]], align 64 +// SIMD-ONLY0-NEXT: [[TMP8:%.*]] = load i16, i16* [[AA]], align 2 +// SIMD-ONLY0-NEXT: [[CONV:%.*]] = sext i16 [[TMP8]] to i32 +// SIMD-ONLY0-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1 +// SIMD-ONLY0-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD]] to i16 +// SIMD-ONLY0-NEXT: store i16 [[CONV2]], i16* [[AA]], align 2 +// SIMD-ONLY0-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[B]], i64 0, i64 2 +// SIMD-ONLY0-NEXT: store float 1.000000e+00, float* [[ARRAYIDX]], align 4 +// SIMD-ONLY0-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[VLA]], i64 3 +// SIMD-ONLY0-NEXT: store float 1.000000e+00, float* [[ARRAYIDX3]], align 4 +// SIMD-ONLY0-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[C]], i64 0, i64 1 +// SIMD-ONLY0-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX4]], i64 0, i64 2 +// SIMD-ONLY0-NEXT: store double 
1.000000e+00, double* [[ARRAYIDX5]], align 8 +// SIMD-ONLY0-NEXT: [[TMP9:%.*]] = mul nsw i64 1, [[TMP4]] +// SIMD-ONLY0-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[VLA1]], i64 [[TMP9]] +// SIMD-ONLY0-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX6]], i64 3 +// SIMD-ONLY0-NEXT: store double 1.000000e+00, double* [[ARRAYIDX7]], align 8 +// SIMD-ONLY0-NEXT: [[X8:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D]], i32 0, i32 0 +// SIMD-ONLY0-NEXT: store i64 1, i64* [[X8]], align 8 +// SIMD-ONLY0-NEXT: [[Y9:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D]], i32 0, i32 1 +// SIMD-ONLY0-NEXT: store i8 1, i8* [[Y9]], align 8 +// SIMD-ONLY0-NEXT: [[X10:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 0 +// SIMD-ONLY0-NEXT: [[TMP10:%.*]] = load i32, i32* [[X10]], align 4 +// SIMD-ONLY0-NEXT: [[CONV11:%.*]] = sitofp i32 [[TMP10]] to double +// SIMD-ONLY0-NEXT: [[TMP11:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[TMP11]], i64 0 +// SIMD-ONLY0-NEXT: store double [[CONV11]], double* [[ARRAYIDX12]], align 8 +// SIMD-ONLY0-NEXT: [[TMP12:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[TMP12]], i64 0 +// SIMD-ONLY0-NEXT: [[TMP13:%.*]] = load double, double* [[ARRAYIDX13]], align 8 +// SIMD-ONLY0-NEXT: [[INC:%.*]] = fadd double [[TMP13]], 1.000000e+00 +// SIMD-ONLY0-NEXT: store double [[INC]], double* [[ARRAYIDX13]], align 8 +// SIMD-ONLY0-NEXT: [[TMP14:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY0-NEXT: [[TMP15:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// SIMD-ONLY0-NEXT: call void @llvm.stackrestore(i8* [[TMP15]]) +// SIMD-ONLY0-NEXT: ret i32 [[TMP14]] +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_Z3bariPd +// SIMD-ONLY0-SAME: (i32 noundef signext [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0]] { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 8 +// SIMD-ONLY0-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[S:%.*]] = alloca [[STRUCT_S1:%.*]], align 8 +// SIMD-ONLY0-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY0-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 8 +// SIMD-ONLY0-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[CALL:%.*]] = call noundef signext i32 @_Z3fooiPd(i32 noundef signext [[TMP0]], double* noundef [[TMP1]]) +// SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY0-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[CALL]] +// SIMD-ONLY0-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// SIMD-ONLY0-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY0-NEXT: [[CALL1:%.*]] = call noundef signext i32 @_ZN2S12r1Ei(%struct.S1* noundef nonnull align 8 dereferenceable(8) [[S]], i32 noundef signext [[TMP3]]) +// SIMD-ONLY0-NEXT: [[TMP4:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY0-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], [[CALL1]] +// SIMD-ONLY0-NEXT: store i32 [[ADD2]], i32* [[A]], align 4 +// SIMD-ONLY0-NEXT: [[TMP5:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY0-NEXT: [[CALL3:%.*]] = call noundef signext i32 @_ZL7fstatici(i32 noundef signext 
[[TMP5]]) +// SIMD-ONLY0-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY0-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP6]], [[CALL3]] +// SIMD-ONLY0-NEXT: store i32 [[ADD4]], i32* [[A]], align 4 +// SIMD-ONLY0-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY0-NEXT: [[CALL5:%.*]] = call noundef signext i32 @_Z9ftemplateIiET_i(i32 noundef signext [[TMP7]]) +// SIMD-ONLY0-NEXT: [[TMP8:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY0-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP8]], [[CALL5]] +// SIMD-ONLY0-NEXT: store i32 [[ADD6]], i32* [[A]], align 4 +// SIMD-ONLY0-NEXT: [[TMP9:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY0-NEXT: ret i32 [[TMP9]] +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN2S12r1Ei +// SIMD-ONLY0-SAME: (%struct.S1* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] comdat align 2 { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 8 +// SIMD-ONLY0-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[B:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// SIMD-ONLY0-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// SIMD-ONLY0-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY0-NEXT: [[THIS1:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 8 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY0-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY0-NEXT: store i32 [[ADD]], i32* [[B]], align 4 +// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY0-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// SIMD-ONLY0-NEXT: [[TMP3:%.*]] = call i8* @llvm.stacksave() +// SIMD-ONLY0-NEXT: store i8* [[TMP3]], i8** [[SAVED_STACK]], align 8 +// SIMD-ONLY0-NEXT: [[TMP4:%.*]] = mul nuw i64 2, [[TMP2]] +// SIMD-ONLY0-NEXT: [[VLA:%.*]] = alloca i16, i64 [[TMP4]], align 2 +// SIMD-ONLY0-NEXT: store i64 [[TMP2]], i64* [[__VLA_EXPR0]], align 8 +// SIMD-ONLY0-NEXT: [[TMP5:%.*]] = load i32, i32* [[B]], align 4 +// SIMD-ONLY0-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP5]] to double +// SIMD-ONLY0-NEXT: [[ADD2:%.*]] = fadd double [[CONV]], 1.500000e+00 +// SIMD-ONLY0-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY0-NEXT: store double [[ADD2]], double* [[A]], align 8 +// SIMD-ONLY0-NEXT: [[A3:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY0-NEXT: [[TMP6:%.*]] = load double, double* [[A3]], align 8 +// SIMD-ONLY0-NEXT: [[INC:%.*]] = fadd double [[TMP6]], 1.000000e+00 +// SIMD-ONLY0-NEXT: store double [[INC]], double* [[A3]], align 8 +// SIMD-ONLY0-NEXT: [[CONV4:%.*]] = fptosi double [[INC]] to i16 +// SIMD-ONLY0-NEXT: [[TMP7:%.*]] = mul nsw i64 1, [[TMP2]] +// SIMD-ONLY0-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA]], i64 [[TMP7]] +// SIMD-ONLY0-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i64 1 +// SIMD-ONLY0-NEXT: store i16 [[CONV4]], i16* [[ARRAYIDX5]], align 2 +// SIMD-ONLY0-NEXT: [[TMP8:%.*]] = mul nsw i64 1, [[TMP2]] +// SIMD-ONLY0-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, i16* [[VLA]], i64 [[TMP8]] +// SIMD-ONLY0-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX6]], i64 1 +// SIMD-ONLY0-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX7]], align 2 +// SIMD-ONLY0-NEXT: 
[[CONV8:%.*]] = sext i16 [[TMP9]] to i32 +// SIMD-ONLY0-NEXT: [[TMP10:%.*]] = load i32, i32* [[B]], align 4 +// SIMD-ONLY0-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], [[TMP10]] +// SIMD-ONLY0-NEXT: [[TMP11:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// SIMD-ONLY0-NEXT: call void @llvm.stackrestore(i8* [[TMP11]]) +// SIMD-ONLY0-NEXT: ret i32 [[ADD9]] +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZL7fstatici +// SIMD-ONLY0-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0]] { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[AAA:%.*]] = alloca i8, align 1 +// SIMD-ONLY0-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY0-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY0-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY0-NEXT: store i8 0, i8* [[AAA]], align 1 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY0-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY0-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load i8, i8* [[AAA]], align 1 +// SIMD-ONLY0-NEXT: [[CONV:%.*]] = sext i8 [[TMP1]] to i32 +// SIMD-ONLY0-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1 +// SIMD-ONLY0-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i8 +// SIMD-ONLY0-NEXT: store i8 [[CONV2]], i8* [[AAA]], align 1 +// SIMD-ONLY0-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B]], i64 0, i64 2 +// SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY0-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP2]], 1 +// SIMD-ONLY0-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY0-NEXT: [[TMP3:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY0-NEXT: ret i32 [[TMP3]] +// +// +// SIMD-ONLY0-LABEL: define {{[^@]+}}@_Z9ftemplateIiET_i +// SIMD-ONLY0-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0]] comdat { +// SIMD-ONLY0-NEXT: entry: +// SIMD-ONLY0-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY0-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY0-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY0-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY0-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// SIMD-ONLY0-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B]], i64 0, i64 2 +// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY0-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP1]], 1 +// SIMD-ONLY0-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY0-NEXT: ret i32 [[TMP2]] +// +// +// SIMD-ONLY01-LABEL: define {{[^@]+}}@_Z3fooiPd +// SIMD-ONLY01-SAME: (i32 noundef signext [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +// SIMD-ONLY01-NEXT: entry: +// SIMD-ONLY01-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY01-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 8 +// SIMD-ONLY01-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY01-NEXT: [[AA:%.*]] = alloca i16, align 2 +// SIMD-ONLY01-NEXT: [[B:%.*]] = alloca [10 x float], align 4 +// SIMD-ONLY01-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// SIMD-ONLY01-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// SIMD-ONLY01-NEXT: [[C:%.*]] = alloca [5 x [10 x double]], align 8 +// 
SIMD-ONLY01-NEXT: [[__VLA_EXPR1:%.*]] = alloca i64, align 8 +// SIMD-ONLY01-NEXT: [[D:%.*]] = alloca [[STRUCT_TT:%.*]], align 8 +// SIMD-ONLY01-NEXT: [[E:%.*]] = alloca [[STRUCT_TT_0:%.*]], align 4 +// SIMD-ONLY01-NEXT: [[P:%.*]] = alloca i32*, align 64 +// SIMD-ONLY01-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY01-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 8 +// SIMD-ONLY01-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY01-NEXT: store i16 0, i16* [[AA]], align 2 +// SIMD-ONLY01-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY01-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +// SIMD-ONLY01-NEXT: [[TMP2:%.*]] = call i8* @llvm.stacksave() +// SIMD-ONLY01-NEXT: store i8* [[TMP2]], i8** [[SAVED_STACK]], align 8 +// SIMD-ONLY01-NEXT: [[VLA:%.*]] = alloca float, i64 [[TMP1]], align 4 +// SIMD-ONLY01-NEXT: store i64 [[TMP1]], i64* [[__VLA_EXPR0]], align 8 +// SIMD-ONLY01-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY01-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +// SIMD-ONLY01-NEXT: [[TMP5:%.*]] = mul nuw i64 5, [[TMP4]] +// SIMD-ONLY01-NEXT: [[VLA1:%.*]] = alloca double, i64 [[TMP5]], align 8 +// SIMD-ONLY01-NEXT: store i64 [[TMP4]], i64* [[__VLA_EXPR1]], align 8 +// SIMD-ONLY01-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 0 +// SIMD-ONLY01-NEXT: [[TMP6:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY01-NEXT: store i32 [[TMP6]], i32* [[X]], align 4 +// SIMD-ONLY01-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 1 +// SIMD-ONLY01-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY01-NEXT: store i32 [[TMP7]], i32* [[Y]], align 4 +// SIMD-ONLY01-NEXT: store i32* [[A]], i32** [[P]], align 64 +// SIMD-ONLY01-NEXT: [[TMP8:%.*]] = load i16, i16* [[AA]], align 2 +// SIMD-ONLY01-NEXT: [[CONV:%.*]] = sext i16 [[TMP8]] to i32 +// SIMD-ONLY01-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1 +// SIMD-ONLY01-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD]] to i16 +// SIMD-ONLY01-NEXT: store i16 [[CONV2]], i16* [[AA]], align 2 +// SIMD-ONLY01-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[B]], i64 0, i64 2 +// SIMD-ONLY01-NEXT: store float 1.000000e+00, float* [[ARRAYIDX]], align 4 +// SIMD-ONLY01-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[VLA]], i64 3 +// SIMD-ONLY01-NEXT: store float 1.000000e+00, float* [[ARRAYIDX3]], align 4 +// SIMD-ONLY01-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[C]], i64 0, i64 1 +// SIMD-ONLY01-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX4]], i64 0, i64 2 +// SIMD-ONLY01-NEXT: store double 1.000000e+00, double* [[ARRAYIDX5]], align 8 +// SIMD-ONLY01-NEXT: [[TMP9:%.*]] = mul nsw i64 1, [[TMP4]] +// SIMD-ONLY01-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[VLA1]], i64 [[TMP9]] +// SIMD-ONLY01-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX6]], i64 3 +// SIMD-ONLY01-NEXT: store double 1.000000e+00, double* [[ARRAYIDX7]], align 8 +// SIMD-ONLY01-NEXT: [[X8:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D]], i32 0, i32 0 +// SIMD-ONLY01-NEXT: store i64 1, i64* [[X8]], align 8 +// SIMD-ONLY01-NEXT: [[Y9:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D]], i32 0, i32 1 +// SIMD-ONLY01-NEXT: store i8 1, i8* [[Y9]], align 8 +// SIMD-ONLY01-NEXT: [[X10:%.*]] = getelementptr inbounds 
[[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 0 +// SIMD-ONLY01-NEXT: [[TMP10:%.*]] = load i32, i32* [[X10]], align 4 +// SIMD-ONLY01-NEXT: [[CONV11:%.*]] = sitofp i32 [[TMP10]] to double +// SIMD-ONLY01-NEXT: [[TMP11:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// SIMD-ONLY01-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[TMP11]], i64 0 +// SIMD-ONLY01-NEXT: store double [[CONV11]], double* [[ARRAYIDX12]], align 8 +// SIMD-ONLY01-NEXT: [[TMP12:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// SIMD-ONLY01-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[TMP12]], i64 0 +// SIMD-ONLY01-NEXT: [[TMP13:%.*]] = load double, double* [[ARRAYIDX13]], align 8 +// SIMD-ONLY01-NEXT: [[INC:%.*]] = fadd double [[TMP13]], 1.000000e+00 +// SIMD-ONLY01-NEXT: store double [[INC]], double* [[ARRAYIDX13]], align 8 +// SIMD-ONLY01-NEXT: [[TMP14:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY01-NEXT: [[TMP15:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// SIMD-ONLY01-NEXT: call void @llvm.stackrestore(i8* [[TMP15]]) +// SIMD-ONLY01-NEXT: ret i32 [[TMP14]] +// +// +// SIMD-ONLY01-LABEL: define {{[^@]+}}@_Z3bariPd +// SIMD-ONLY01-SAME: (i32 noundef signext [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0]] { +// SIMD-ONLY01-NEXT: entry: +// SIMD-ONLY01-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY01-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 8 +// SIMD-ONLY01-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY01-NEXT: [[S:%.*]] = alloca [[STRUCT_S1:%.*]], align 8 +// SIMD-ONLY01-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY01-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 8 +// SIMD-ONLY01-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY01-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY01-NEXT: [[TMP1:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// SIMD-ONLY01-NEXT: [[CALL:%.*]] = call noundef signext i32 @_Z3fooiPd(i32 noundef signext [[TMP0]], double* noundef [[TMP1]]) +// SIMD-ONLY01-NEXT: [[TMP2:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY01-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[CALL]] +// SIMD-ONLY01-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// SIMD-ONLY01-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY01-NEXT: [[CALL1:%.*]] = call noundef signext i32 @_ZN2S12r1Ei(%struct.S1* noundef nonnull align 8 dereferenceable(8) [[S]], i32 noundef signext [[TMP3]]) +// SIMD-ONLY01-NEXT: [[TMP4:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY01-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], [[CALL1]] +// SIMD-ONLY01-NEXT: store i32 [[ADD2]], i32* [[A]], align 4 +// SIMD-ONLY01-NEXT: [[TMP5:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY01-NEXT: [[CALL3:%.*]] = call noundef signext i32 @_ZL7fstatici(i32 noundef signext [[TMP5]]) +// SIMD-ONLY01-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY01-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP6]], [[CALL3]] +// SIMD-ONLY01-NEXT: store i32 [[ADD4]], i32* [[A]], align 4 +// SIMD-ONLY01-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY01-NEXT: [[CALL5:%.*]] = call noundef signext i32 @_Z9ftemplateIiET_i(i32 noundef signext [[TMP7]]) +// SIMD-ONLY01-NEXT: [[TMP8:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY01-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP8]], [[CALL5]] +// SIMD-ONLY01-NEXT: store i32 [[ADD6]], i32* [[A]], align 4 +// SIMD-ONLY01-NEXT: [[TMP9:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY01-NEXT: ret i32 [[TMP9]] +// +// +// SIMD-ONLY01-LABEL: 
define {{[^@]+}}@_ZN2S12r1Ei +// SIMD-ONLY01-SAME: (%struct.S1* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] comdat align 2 { +// SIMD-ONLY01-NEXT: entry: +// SIMD-ONLY01-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 8 +// SIMD-ONLY01-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY01-NEXT: [[B:%.*]] = alloca i32, align 4 +// SIMD-ONLY01-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// SIMD-ONLY01-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// SIMD-ONLY01-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 8 +// SIMD-ONLY01-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY01-NEXT: [[THIS1:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 8 +// SIMD-ONLY01-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY01-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY01-NEXT: store i32 [[ADD]], i32* [[B]], align 4 +// SIMD-ONLY01-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY01-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// SIMD-ONLY01-NEXT: [[TMP3:%.*]] = call i8* @llvm.stacksave() +// SIMD-ONLY01-NEXT: store i8* [[TMP3]], i8** [[SAVED_STACK]], align 8 +// SIMD-ONLY01-NEXT: [[TMP4:%.*]] = mul nuw i64 2, [[TMP2]] +// SIMD-ONLY01-NEXT: [[VLA:%.*]] = alloca i16, i64 [[TMP4]], align 2 +// SIMD-ONLY01-NEXT: store i64 [[TMP2]], i64* [[__VLA_EXPR0]], align 8 +// SIMD-ONLY01-NEXT: [[TMP5:%.*]] = load i32, i32* [[B]], align 4 +// SIMD-ONLY01-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP5]] to double +// SIMD-ONLY01-NEXT: [[ADD2:%.*]] = fadd double [[CONV]], 1.500000e+00 +// SIMD-ONLY01-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY01-NEXT: store double [[ADD2]], double* [[A]], align 8 +// SIMD-ONLY01-NEXT: [[A3:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY01-NEXT: [[TMP6:%.*]] = load double, double* [[A3]], align 8 +// SIMD-ONLY01-NEXT: [[INC:%.*]] = fadd double [[TMP6]], 1.000000e+00 +// SIMD-ONLY01-NEXT: store double [[INC]], double* [[A3]], align 8 +// SIMD-ONLY01-NEXT: [[CONV4:%.*]] = fptosi double [[INC]] to i16 +// SIMD-ONLY01-NEXT: [[TMP7:%.*]] = mul nsw i64 1, [[TMP2]] +// SIMD-ONLY01-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA]], i64 [[TMP7]] +// SIMD-ONLY01-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i64 1 +// SIMD-ONLY01-NEXT: store i16 [[CONV4]], i16* [[ARRAYIDX5]], align 2 +// SIMD-ONLY01-NEXT: [[TMP8:%.*]] = mul nsw i64 1, [[TMP2]] +// SIMD-ONLY01-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, i16* [[VLA]], i64 [[TMP8]] +// SIMD-ONLY01-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX6]], i64 1 +// SIMD-ONLY01-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX7]], align 2 +// SIMD-ONLY01-NEXT: [[CONV8:%.*]] = sext i16 [[TMP9]] to i32 +// SIMD-ONLY01-NEXT: [[TMP10:%.*]] = load i32, i32* [[B]], align 4 +// SIMD-ONLY01-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], [[TMP10]] +// SIMD-ONLY01-NEXT: [[TMP11:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// SIMD-ONLY01-NEXT: call void @llvm.stackrestore(i8* [[TMP11]]) +// SIMD-ONLY01-NEXT: ret i32 [[ADD9]] +// +// +// SIMD-ONLY01-LABEL: define {{[^@]+}}@_ZL7fstatici +// SIMD-ONLY01-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0]] { +// SIMD-ONLY01-NEXT: entry: +// SIMD-ONLY01-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY01-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY01-NEXT: [[AAA:%.*]] = alloca 
i8, align 1 +// SIMD-ONLY01-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY01-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY01-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY01-NEXT: store i8 0, i8* [[AAA]], align 1 +// SIMD-ONLY01-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY01-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY01-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// SIMD-ONLY01-NEXT: [[TMP1:%.*]] = load i8, i8* [[AAA]], align 1 +// SIMD-ONLY01-NEXT: [[CONV:%.*]] = sext i8 [[TMP1]] to i32 +// SIMD-ONLY01-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1 +// SIMD-ONLY01-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i8 +// SIMD-ONLY01-NEXT: store i8 [[CONV2]], i8* [[AAA]], align 1 +// SIMD-ONLY01-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B]], i64 0, i64 2 +// SIMD-ONLY01-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY01-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP2]], 1 +// SIMD-ONLY01-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY01-NEXT: [[TMP3:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY01-NEXT: ret i32 [[TMP3]] +// +// +// SIMD-ONLY01-LABEL: define {{[^@]+}}@_Z9ftemplateIiET_i +// SIMD-ONLY01-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0]] comdat { +// SIMD-ONLY01-NEXT: entry: +// SIMD-ONLY01-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY01-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY01-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY01-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY01-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY01-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY01-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY01-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// SIMD-ONLY01-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B]], i64 0, i64 2 +// SIMD-ONLY01-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY01-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP1]], 1 +// SIMD-ONLY01-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY01-NEXT: [[TMP2:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY01-NEXT: ret i32 [[TMP2]] +// +// +// SIMD-ONLY02-LABEL: define {{[^@]+}}@_Z3fooiPd +// SIMD-ONLY02-SAME: (i32 noundef [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +// SIMD-ONLY02-NEXT: entry: +// SIMD-ONLY02-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY02-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 4 +// SIMD-ONLY02-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY02-NEXT: [[AA:%.*]] = alloca i16, align 2 +// SIMD-ONLY02-NEXT: [[B:%.*]] = alloca [10 x float], align 4 +// SIMD-ONLY02-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// SIMD-ONLY02-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// SIMD-ONLY02-NEXT: [[C:%.*]] = alloca [5 x [10 x double]], align 8 +// SIMD-ONLY02-NEXT: [[__VLA_EXPR1:%.*]] = alloca i32, align 4 +// SIMD-ONLY02-NEXT: [[D:%.*]] = alloca [[STRUCT_TT:%.*]], align 4 +// SIMD-ONLY02-NEXT: [[E:%.*]] = alloca [[STRUCT_TT_0:%.*]], align 4 +// SIMD-ONLY02-NEXT: [[P:%.*]] = alloca i32*, align 64 +// SIMD-ONLY02-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY02-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 4 +// SIMD-ONLY02-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY02-NEXT: store i16 0, i16* [[AA]], align 2 +// SIMD-ONLY02-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY02-NEXT: [[TMP1:%.*]] = call i8* 
@llvm.stacksave() +// SIMD-ONLY02-NEXT: store i8* [[TMP1]], i8** [[SAVED_STACK]], align 4 +// SIMD-ONLY02-NEXT: [[VLA:%.*]] = alloca float, i32 [[TMP0]], align 4 +// SIMD-ONLY02-NEXT: store i32 [[TMP0]], i32* [[__VLA_EXPR0]], align 4 +// SIMD-ONLY02-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY02-NEXT: [[TMP3:%.*]] = mul nuw i32 5, [[TMP2]] +// SIMD-ONLY02-NEXT: [[VLA1:%.*]] = alloca double, i32 [[TMP3]], align 8 +// SIMD-ONLY02-NEXT: store i32 [[TMP2]], i32* [[__VLA_EXPR1]], align 4 +// SIMD-ONLY02-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 0 +// SIMD-ONLY02-NEXT: [[TMP4:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY02-NEXT: store i32 [[TMP4]], i32* [[X]], align 4 +// SIMD-ONLY02-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 1 +// SIMD-ONLY02-NEXT: [[TMP5:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY02-NEXT: store i32 [[TMP5]], i32* [[Y]], align 4 +// SIMD-ONLY02-NEXT: store i32* [[A]], i32** [[P]], align 64 +// SIMD-ONLY02-NEXT: [[TMP6:%.*]] = load i16, i16* [[AA]], align 2 +// SIMD-ONLY02-NEXT: [[CONV:%.*]] = sext i16 [[TMP6]] to i32 +// SIMD-ONLY02-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1 +// SIMD-ONLY02-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD]] to i16 +// SIMD-ONLY02-NEXT: store i16 [[CONV2]], i16* [[AA]], align 2 +// SIMD-ONLY02-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[B]], i32 0, i32 2 +// SIMD-ONLY02-NEXT: store float 1.000000e+00, float* [[ARRAYIDX]], align 4 +// SIMD-ONLY02-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[VLA]], i32 3 +// SIMD-ONLY02-NEXT: store float 1.000000e+00, float* [[ARRAYIDX3]], align 4 +// SIMD-ONLY02-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[C]], i32 0, i32 1 +// SIMD-ONLY02-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX4]], i32 0, i32 2 +// SIMD-ONLY02-NEXT: store double 1.000000e+00, double* [[ARRAYIDX5]], align 8 +// SIMD-ONLY02-NEXT: [[TMP7:%.*]] = mul nsw i32 1, [[TMP2]] +// SIMD-ONLY02-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[VLA1]], i32 [[TMP7]] +// SIMD-ONLY02-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX6]], i32 3 +// SIMD-ONLY02-NEXT: store double 1.000000e+00, double* [[ARRAYIDX7]], align 8 +// SIMD-ONLY02-NEXT: [[X8:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D]], i32 0, i32 0 +// SIMD-ONLY02-NEXT: store i64 1, i64* [[X8]], align 4 +// SIMD-ONLY02-NEXT: [[Y9:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D]], i32 0, i32 1 +// SIMD-ONLY02-NEXT: store i8 1, i8* [[Y9]], align 4 +// SIMD-ONLY02-NEXT: [[X10:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 0 +// SIMD-ONLY02-NEXT: [[TMP8:%.*]] = load i32, i32* [[X10]], align 4 +// SIMD-ONLY02-NEXT: [[CONV11:%.*]] = sitofp i32 [[TMP8]] to double +// SIMD-ONLY02-NEXT: [[TMP9:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// SIMD-ONLY02-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[TMP9]], i32 0 +// SIMD-ONLY02-NEXT: store double [[CONV11]], double* [[ARRAYIDX12]], align 4 +// SIMD-ONLY02-NEXT: [[TMP10:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// SIMD-ONLY02-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0 +// SIMD-ONLY02-NEXT: [[TMP11:%.*]] = load double, double* [[ARRAYIDX13]], align 4 +// SIMD-ONLY02-NEXT: [[INC:%.*]] = fadd double 
[[TMP11]], 1.000000e+00 +// SIMD-ONLY02-NEXT: store double [[INC]], double* [[ARRAYIDX13]], align 4 +// SIMD-ONLY02-NEXT: [[TMP12:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY02-NEXT: [[TMP13:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// SIMD-ONLY02-NEXT: call void @llvm.stackrestore(i8* [[TMP13]]) +// SIMD-ONLY02-NEXT: ret i32 [[TMP12]] +// +// +// SIMD-ONLY02-LABEL: define {{[^@]+}}@_Z3bariPd +// SIMD-ONLY02-SAME: (i32 noundef [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0]] { +// SIMD-ONLY02-NEXT: entry: +// SIMD-ONLY02-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY02-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 4 +// SIMD-ONLY02-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY02-NEXT: [[S:%.*]] = alloca [[STRUCT_S1:%.*]], align 4 +// SIMD-ONLY02-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY02-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 4 +// SIMD-ONLY02-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY02-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY02-NEXT: [[TMP1:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// SIMD-ONLY02-NEXT: [[CALL:%.*]] = call noundef i32 @_Z3fooiPd(i32 noundef [[TMP0]], double* noundef [[TMP1]]) +// SIMD-ONLY02-NEXT: [[TMP2:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY02-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[CALL]] +// SIMD-ONLY02-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// SIMD-ONLY02-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY02-NEXT: [[CALL1:%.*]] = call noundef i32 @_ZN2S12r1Ei(%struct.S1* noundef nonnull align 4 dereferenceable(8) [[S]], i32 noundef [[TMP3]]) +// SIMD-ONLY02-NEXT: [[TMP4:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY02-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], [[CALL1]] +// SIMD-ONLY02-NEXT: store i32 [[ADD2]], i32* [[A]], align 4 +// SIMD-ONLY02-NEXT: [[TMP5:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY02-NEXT: [[CALL3:%.*]] = call noundef i32 @_ZL7fstatici(i32 noundef [[TMP5]]) +// SIMD-ONLY02-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY02-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP6]], [[CALL3]] +// SIMD-ONLY02-NEXT: store i32 [[ADD4]], i32* [[A]], align 4 +// SIMD-ONLY02-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY02-NEXT: [[CALL5:%.*]] = call noundef i32 @_Z9ftemplateIiET_i(i32 noundef [[TMP7]]) +// SIMD-ONLY02-NEXT: [[TMP8:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY02-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP8]], [[CALL5]] +// SIMD-ONLY02-NEXT: store i32 [[ADD6]], i32* [[A]], align 4 +// SIMD-ONLY02-NEXT: [[TMP9:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY02-NEXT: ret i32 [[TMP9]] +// +// +// SIMD-ONLY02-LABEL: define {{[^@]+}}@_ZN2S12r1Ei +// SIMD-ONLY02-SAME: (%struct.S1* noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0]] comdat align 2 { +// SIMD-ONLY02-NEXT: entry: +// SIMD-ONLY02-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 4 +// SIMD-ONLY02-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY02-NEXT: [[B:%.*]] = alloca i32, align 4 +// SIMD-ONLY02-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// SIMD-ONLY02-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// SIMD-ONLY02-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 4 +// SIMD-ONLY02-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY02-NEXT: [[THIS1:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 4 +// SIMD-ONLY02-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// 
SIMD-ONLY02-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY02-NEXT: store i32 [[ADD]], i32* [[B]], align 4 +// SIMD-ONLY02-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY02-NEXT: [[TMP2:%.*]] = call i8* @llvm.stacksave() +// SIMD-ONLY02-NEXT: store i8* [[TMP2]], i8** [[SAVED_STACK]], align 4 +// SIMD-ONLY02-NEXT: [[TMP3:%.*]] = mul nuw i32 2, [[TMP1]] +// SIMD-ONLY02-NEXT: [[VLA:%.*]] = alloca i16, i32 [[TMP3]], align 2 +// SIMD-ONLY02-NEXT: store i32 [[TMP1]], i32* [[__VLA_EXPR0]], align 4 +// SIMD-ONLY02-NEXT: [[TMP4:%.*]] = load i32, i32* [[B]], align 4 +// SIMD-ONLY02-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP4]] to double +// SIMD-ONLY02-NEXT: [[ADD2:%.*]] = fadd double [[CONV]], 1.500000e+00 +// SIMD-ONLY02-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY02-NEXT: store double [[ADD2]], double* [[A]], align 4 +// SIMD-ONLY02-NEXT: [[A3:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY02-NEXT: [[TMP5:%.*]] = load double, double* [[A3]], align 4 +// SIMD-ONLY02-NEXT: [[INC:%.*]] = fadd double [[TMP5]], 1.000000e+00 +// SIMD-ONLY02-NEXT: store double [[INC]], double* [[A3]], align 4 +// SIMD-ONLY02-NEXT: [[CONV4:%.*]] = fptosi double [[INC]] to i16 +// SIMD-ONLY02-NEXT: [[TMP6:%.*]] = mul nsw i32 1, [[TMP1]] +// SIMD-ONLY02-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA]], i32 [[TMP6]] +// SIMD-ONLY02-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i32 1 +// SIMD-ONLY02-NEXT: store i16 [[CONV4]], i16* [[ARRAYIDX5]], align 2 +// SIMD-ONLY02-NEXT: [[TMP7:%.*]] = mul nsw i32 1, [[TMP1]] +// SIMD-ONLY02-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, i16* [[VLA]], i32 [[TMP7]] +// SIMD-ONLY02-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX6]], i32 1 +// SIMD-ONLY02-NEXT: [[TMP8:%.*]] = load i16, i16* [[ARRAYIDX7]], align 2 +// SIMD-ONLY02-NEXT: [[CONV8:%.*]] = sext i16 [[TMP8]] to i32 +// SIMD-ONLY02-NEXT: [[TMP9:%.*]] = load i32, i32* [[B]], align 4 +// SIMD-ONLY02-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], [[TMP9]] +// SIMD-ONLY02-NEXT: [[TMP10:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// SIMD-ONLY02-NEXT: call void @llvm.stackrestore(i8* [[TMP10]]) +// SIMD-ONLY02-NEXT: ret i32 [[ADD9]] +// +// +// SIMD-ONLY02-LABEL: define {{[^@]+}}@_ZL7fstatici +// SIMD-ONLY02-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0]] { +// SIMD-ONLY02-NEXT: entry: +// SIMD-ONLY02-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY02-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY02-NEXT: [[AAA:%.*]] = alloca i8, align 1 +// SIMD-ONLY02-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY02-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY02-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY02-NEXT: store i8 0, i8* [[AAA]], align 1 +// SIMD-ONLY02-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY02-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY02-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// SIMD-ONLY02-NEXT: [[TMP1:%.*]] = load i8, i8* [[AAA]], align 1 +// SIMD-ONLY02-NEXT: [[CONV:%.*]] = sext i8 [[TMP1]] to i32 +// SIMD-ONLY02-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1 +// SIMD-ONLY02-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i8 +// SIMD-ONLY02-NEXT: store i8 [[CONV2]], i8* [[AAA]], align 1 +// SIMD-ONLY02-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B]], i32 0, i32 2 +// SIMD-ONLY02-NEXT: [[TMP2:%.*]] = load i32, i32* 
[[ARRAYIDX]], align 4 +// SIMD-ONLY02-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP2]], 1 +// SIMD-ONLY02-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY02-NEXT: [[TMP3:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY02-NEXT: ret i32 [[TMP3]] +// +// +// SIMD-ONLY02-LABEL: define {{[^@]+}}@_Z9ftemplateIiET_i +// SIMD-ONLY02-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0]] comdat { +// SIMD-ONLY02-NEXT: entry: +// SIMD-ONLY02-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY02-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY02-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY02-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY02-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY02-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY02-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY02-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// SIMD-ONLY02-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B]], i32 0, i32 2 +// SIMD-ONLY02-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY02-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP1]], 1 +// SIMD-ONLY02-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY02-NEXT: [[TMP2:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY02-NEXT: ret i32 [[TMP2]] +// +// +// SIMD-ONLY03-LABEL: define {{[^@]+}}@_Z3fooiPd +// SIMD-ONLY03-SAME: (i32 noundef [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +// SIMD-ONLY03-NEXT: entry: +// SIMD-ONLY03-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY03-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 4 +// SIMD-ONLY03-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY03-NEXT: [[AA:%.*]] = alloca i16, align 2 +// SIMD-ONLY03-NEXT: [[B:%.*]] = alloca [10 x float], align 4 +// SIMD-ONLY03-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// SIMD-ONLY03-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// SIMD-ONLY03-NEXT: [[C:%.*]] = alloca [5 x [10 x double]], align 8 +// SIMD-ONLY03-NEXT: [[__VLA_EXPR1:%.*]] = alloca i32, align 4 +// SIMD-ONLY03-NEXT: [[D:%.*]] = alloca [[STRUCT_TT:%.*]], align 4 +// SIMD-ONLY03-NEXT: [[E:%.*]] = alloca [[STRUCT_TT_0:%.*]], align 4 +// SIMD-ONLY03-NEXT: [[P:%.*]] = alloca i32*, align 64 +// SIMD-ONLY03-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY03-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 4 +// SIMD-ONLY03-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY03-NEXT: store i16 0, i16* [[AA]], align 2 +// SIMD-ONLY03-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY03-NEXT: [[TMP1:%.*]] = call i8* @llvm.stacksave() +// SIMD-ONLY03-NEXT: store i8* [[TMP1]], i8** [[SAVED_STACK]], align 4 +// SIMD-ONLY03-NEXT: [[VLA:%.*]] = alloca float, i32 [[TMP0]], align 4 +// SIMD-ONLY03-NEXT: store i32 [[TMP0]], i32* [[__VLA_EXPR0]], align 4 +// SIMD-ONLY03-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY03-NEXT: [[TMP3:%.*]] = mul nuw i32 5, [[TMP2]] +// SIMD-ONLY03-NEXT: [[VLA1:%.*]] = alloca double, i32 [[TMP3]], align 8 +// SIMD-ONLY03-NEXT: store i32 [[TMP2]], i32* [[__VLA_EXPR1]], align 4 +// SIMD-ONLY03-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 0 +// SIMD-ONLY03-NEXT: [[TMP4:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY03-NEXT: store i32 [[TMP4]], i32* [[X]], align 4 +// SIMD-ONLY03-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 1 +// SIMD-ONLY03-NEXT: [[TMP5:%.*]] = load i32, i32* [[N_ADDR]], align 
4 +// SIMD-ONLY03-NEXT: store i32 [[TMP5]], i32* [[Y]], align 4 +// SIMD-ONLY03-NEXT: store i32* [[A]], i32** [[P]], align 64 +// SIMD-ONLY03-NEXT: [[TMP6:%.*]] = load i16, i16* [[AA]], align 2 +// SIMD-ONLY03-NEXT: [[CONV:%.*]] = sext i16 [[TMP6]] to i32 +// SIMD-ONLY03-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1 +// SIMD-ONLY03-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD]] to i16 +// SIMD-ONLY03-NEXT: store i16 [[CONV2]], i16* [[AA]], align 2 +// SIMD-ONLY03-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[B]], i32 0, i32 2 +// SIMD-ONLY03-NEXT: store float 1.000000e+00, float* [[ARRAYIDX]], align 4 +// SIMD-ONLY03-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[VLA]], i32 3 +// SIMD-ONLY03-NEXT: store float 1.000000e+00, float* [[ARRAYIDX3]], align 4 +// SIMD-ONLY03-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[C]], i32 0, i32 1 +// SIMD-ONLY03-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX4]], i32 0, i32 2 +// SIMD-ONLY03-NEXT: store double 1.000000e+00, double* [[ARRAYIDX5]], align 8 +// SIMD-ONLY03-NEXT: [[TMP7:%.*]] = mul nsw i32 1, [[TMP2]] +// SIMD-ONLY03-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[VLA1]], i32 [[TMP7]] +// SIMD-ONLY03-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX6]], i32 3 +// SIMD-ONLY03-NEXT: store double 1.000000e+00, double* [[ARRAYIDX7]], align 8 +// SIMD-ONLY03-NEXT: [[X8:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D]], i32 0, i32 0 +// SIMD-ONLY03-NEXT: store i64 1, i64* [[X8]], align 4 +// SIMD-ONLY03-NEXT: [[Y9:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D]], i32 0, i32 1 +// SIMD-ONLY03-NEXT: store i8 1, i8* [[Y9]], align 4 +// SIMD-ONLY03-NEXT: [[X10:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 0 +// SIMD-ONLY03-NEXT: [[TMP8:%.*]] = load i32, i32* [[X10]], align 4 +// SIMD-ONLY03-NEXT: [[CONV11:%.*]] = sitofp i32 [[TMP8]] to double +// SIMD-ONLY03-NEXT: [[TMP9:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// SIMD-ONLY03-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[TMP9]], i32 0 +// SIMD-ONLY03-NEXT: store double [[CONV11]], double* [[ARRAYIDX12]], align 4 +// SIMD-ONLY03-NEXT: [[TMP10:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// SIMD-ONLY03-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0 +// SIMD-ONLY03-NEXT: [[TMP11:%.*]] = load double, double* [[ARRAYIDX13]], align 4 +// SIMD-ONLY03-NEXT: [[INC:%.*]] = fadd double [[TMP11]], 1.000000e+00 +// SIMD-ONLY03-NEXT: store double [[INC]], double* [[ARRAYIDX13]], align 4 +// SIMD-ONLY03-NEXT: [[TMP12:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY03-NEXT: [[TMP13:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// SIMD-ONLY03-NEXT: call void @llvm.stackrestore(i8* [[TMP13]]) +// SIMD-ONLY03-NEXT: ret i32 [[TMP12]] +// +// +// SIMD-ONLY03-LABEL: define {{[^@]+}}@_Z3bariPd +// SIMD-ONLY03-SAME: (i32 noundef [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0]] { +// SIMD-ONLY03-NEXT: entry: +// SIMD-ONLY03-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY03-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 4 +// SIMD-ONLY03-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY03-NEXT: [[S:%.*]] = alloca [[STRUCT_S1:%.*]], align 4 +// SIMD-ONLY03-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY03-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 4 +// 
SIMD-ONLY03-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY03-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY03-NEXT: [[TMP1:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// SIMD-ONLY03-NEXT: [[CALL:%.*]] = call noundef i32 @_Z3fooiPd(i32 noundef [[TMP0]], double* noundef [[TMP1]]) +// SIMD-ONLY03-NEXT: [[TMP2:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY03-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[CALL]] +// SIMD-ONLY03-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// SIMD-ONLY03-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY03-NEXT: [[CALL1:%.*]] = call noundef i32 @_ZN2S12r1Ei(%struct.S1* noundef nonnull align 4 dereferenceable(8) [[S]], i32 noundef [[TMP3]]) +// SIMD-ONLY03-NEXT: [[TMP4:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY03-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], [[CALL1]] +// SIMD-ONLY03-NEXT: store i32 [[ADD2]], i32* [[A]], align 4 +// SIMD-ONLY03-NEXT: [[TMP5:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY03-NEXT: [[CALL3:%.*]] = call noundef i32 @_ZL7fstatici(i32 noundef [[TMP5]]) +// SIMD-ONLY03-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY03-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP6]], [[CALL3]] +// SIMD-ONLY03-NEXT: store i32 [[ADD4]], i32* [[A]], align 4 +// SIMD-ONLY03-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY03-NEXT: [[CALL5:%.*]] = call noundef i32 @_Z9ftemplateIiET_i(i32 noundef [[TMP7]]) +// SIMD-ONLY03-NEXT: [[TMP8:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY03-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP8]], [[CALL5]] +// SIMD-ONLY03-NEXT: store i32 [[ADD6]], i32* [[A]], align 4 +// SIMD-ONLY03-NEXT: [[TMP9:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY03-NEXT: ret i32 [[TMP9]] +// +// +// SIMD-ONLY03-LABEL: define {{[^@]+}}@_ZN2S12r1Ei +// SIMD-ONLY03-SAME: (%struct.S1* noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0]] comdat align 2 { +// SIMD-ONLY03-NEXT: entry: +// SIMD-ONLY03-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 4 +// SIMD-ONLY03-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY03-NEXT: [[B:%.*]] = alloca i32, align 4 +// SIMD-ONLY03-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// SIMD-ONLY03-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// SIMD-ONLY03-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 4 +// SIMD-ONLY03-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY03-NEXT: [[THIS1:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 4 +// SIMD-ONLY03-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY03-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY03-NEXT: store i32 [[ADD]], i32* [[B]], align 4 +// SIMD-ONLY03-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY03-NEXT: [[TMP2:%.*]] = call i8* @llvm.stacksave() +// SIMD-ONLY03-NEXT: store i8* [[TMP2]], i8** [[SAVED_STACK]], align 4 +// SIMD-ONLY03-NEXT: [[TMP3:%.*]] = mul nuw i32 2, [[TMP1]] +// SIMD-ONLY03-NEXT: [[VLA:%.*]] = alloca i16, i32 [[TMP3]], align 2 +// SIMD-ONLY03-NEXT: store i32 [[TMP1]], i32* [[__VLA_EXPR0]], align 4 +// SIMD-ONLY03-NEXT: [[TMP4:%.*]] = load i32, i32* [[B]], align 4 +// SIMD-ONLY03-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP4]] to double +// SIMD-ONLY03-NEXT: [[ADD2:%.*]] = fadd double [[CONV]], 1.500000e+00 +// SIMD-ONLY03-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY03-NEXT: store double [[ADD2]], double* [[A]], align 4 +// 
SIMD-ONLY03-NEXT: [[A3:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY03-NEXT: [[TMP5:%.*]] = load double, double* [[A3]], align 4 +// SIMD-ONLY03-NEXT: [[INC:%.*]] = fadd double [[TMP5]], 1.000000e+00 +// SIMD-ONLY03-NEXT: store double [[INC]], double* [[A3]], align 4 +// SIMD-ONLY03-NEXT: [[CONV4:%.*]] = fptosi double [[INC]] to i16 +// SIMD-ONLY03-NEXT: [[TMP6:%.*]] = mul nsw i32 1, [[TMP1]] +// SIMD-ONLY03-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA]], i32 [[TMP6]] +// SIMD-ONLY03-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i32 1 +// SIMD-ONLY03-NEXT: store i16 [[CONV4]], i16* [[ARRAYIDX5]], align 2 +// SIMD-ONLY03-NEXT: [[TMP7:%.*]] = mul nsw i32 1, [[TMP1]] +// SIMD-ONLY03-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, i16* [[VLA]], i32 [[TMP7]] +// SIMD-ONLY03-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX6]], i32 1 +// SIMD-ONLY03-NEXT: [[TMP8:%.*]] = load i16, i16* [[ARRAYIDX7]], align 2 +// SIMD-ONLY03-NEXT: [[CONV8:%.*]] = sext i16 [[TMP8]] to i32 +// SIMD-ONLY03-NEXT: [[TMP9:%.*]] = load i32, i32* [[B]], align 4 +// SIMD-ONLY03-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], [[TMP9]] +// SIMD-ONLY03-NEXT: [[TMP10:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// SIMD-ONLY03-NEXT: call void @llvm.stackrestore(i8* [[TMP10]]) +// SIMD-ONLY03-NEXT: ret i32 [[ADD9]] +// +// +// SIMD-ONLY03-LABEL: define {{[^@]+}}@_ZL7fstatici +// SIMD-ONLY03-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0]] { +// SIMD-ONLY03-NEXT: entry: +// SIMD-ONLY03-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY03-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY03-NEXT: [[AAA:%.*]] = alloca i8, align 1 +// SIMD-ONLY03-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY03-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY03-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY03-NEXT: store i8 0, i8* [[AAA]], align 1 +// SIMD-ONLY03-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY03-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY03-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// SIMD-ONLY03-NEXT: [[TMP1:%.*]] = load i8, i8* [[AAA]], align 1 +// SIMD-ONLY03-NEXT: [[CONV:%.*]] = sext i8 [[TMP1]] to i32 +// SIMD-ONLY03-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1 +// SIMD-ONLY03-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i8 +// SIMD-ONLY03-NEXT: store i8 [[CONV2]], i8* [[AAA]], align 1 +// SIMD-ONLY03-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B]], i32 0, i32 2 +// SIMD-ONLY03-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY03-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP2]], 1 +// SIMD-ONLY03-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY03-NEXT: [[TMP3:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY03-NEXT: ret i32 [[TMP3]] +// +// +// SIMD-ONLY03-LABEL: define {{[^@]+}}@_Z9ftemplateIiET_i +// SIMD-ONLY03-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0]] comdat { +// SIMD-ONLY03-NEXT: entry: +// SIMD-ONLY03-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY03-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY03-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY03-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY03-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY03-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY03-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY03-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// 
SIMD-ONLY03-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B]], i32 0, i32 2 +// SIMD-ONLY03-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY03-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP1]], 1 +// SIMD-ONLY03-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY03-NEXT: [[TMP2:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY03-NEXT: ret i32 [[TMP2]] +// +// +// TCHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63 +// TCHECK-SAME: (i64 noundef [[A:%.*]], i32* noundef [[P:%.*]], i64 noundef [[GA:%.*]]) #[[ATTR0:[0-9]+]] { +// TCHECK-NEXT: entry: +// TCHECK-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// TCHECK-NEXT: [[P_ADDR:%.*]] = alloca i32*, align 8 +// TCHECK-NEXT: [[GA_ADDR:%.*]] = alloca i64, align 8 +// TCHECK-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// TCHECK-NEXT: store i32* [[P]], i32** [[P_ADDR]], align 8 +// TCHECK-NEXT: store i64 [[GA]], i64* [[GA_ADDR]], align 8 +// TCHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// TCHECK-NEXT: [[CONV1:%.*]] = bitcast i64* [[GA_ADDR]] to i32* +// TCHECK-NEXT: ret void +// +// +// TCHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70 +// TCHECK-SAME: (i64 noundef [[AA:%.*]], [10 x float]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i64 noundef [[VLA:%.*]], float* noundef nonnull align 4 dereferenceable(4) [[BN:%.*]], [5 x [10 x double]]* noundef nonnull align 8 dereferenceable(400) [[C:%.*]], i64 noundef [[VLA1:%.*]], i64 noundef [[VLA3:%.*]], double* noundef nonnull align 8 dereferenceable(8) [[CN:%.*]], %struct.TT* noundef nonnull align 8 dereferenceable(16) [[D:%.*]]) #[[ATTR0]] { +// TCHECK-NEXT: entry: +// TCHECK-NEXT: [[AA_ADDR:%.*]] = alloca i64, align 8 +// TCHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x float]*, align 8 +// TCHECK-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8 +// TCHECK-NEXT: [[BN_ADDR:%.*]] = alloca float*, align 8 +// TCHECK-NEXT: [[C_ADDR:%.*]] = alloca [5 x [10 x double]]*, align 8 +// TCHECK-NEXT: [[VLA_ADDR2:%.*]] = alloca i64, align 8 +// TCHECK-NEXT: [[VLA_ADDR4:%.*]] = alloca i64, align 8 +// TCHECK-NEXT: [[CN_ADDR:%.*]] = alloca double*, align 8 +// TCHECK-NEXT: [[D_ADDR:%.*]] = alloca %struct.TT*, align 8 +// TCHECK-NEXT: [[B5:%.*]] = alloca [10 x float], align 4 +// TCHECK-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// TCHECK-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// TCHECK-NEXT: [[C7:%.*]] = alloca [5 x [10 x double]], align 8 +// TCHECK-NEXT: [[__VLA_EXPR1:%.*]] = alloca i64, align 8 +// TCHECK-NEXT: [[__VLA_EXPR2:%.*]] = alloca i64, align 8 +// TCHECK-NEXT: [[D9:%.*]] = alloca [[STRUCT_TT:%.*]], align 8 +// TCHECK-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8 +// TCHECK-NEXT: store [10 x float]* [[B]], [10 x float]** [[B_ADDR]], align 8 +// TCHECK-NEXT: store i64 [[VLA]], i64* [[VLA_ADDR]], align 8 +// TCHECK-NEXT: store float* [[BN]], float** [[BN_ADDR]], align 8 +// TCHECK-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[C_ADDR]], align 8 +// TCHECK-NEXT: store i64 [[VLA1]], i64* [[VLA_ADDR2]], align 8 +// TCHECK-NEXT: store i64 [[VLA3]], i64* [[VLA_ADDR4]], align 8 +// TCHECK-NEXT: store double* [[CN]], double** [[CN_ADDR]], align 8 +// TCHECK-NEXT: store %struct.TT* [[D]], %struct.TT** [[D_ADDR]], align 8 +// TCHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[AA_ADDR]] to i16* +// TCHECK-NEXT: [[TMP0:%.*]] = load [10 x float]*, [10 x float]** [[B_ADDR]], align 8 +// TCHECK-NEXT: [[TMP1:%.*]] = load i64, i64* [[VLA_ADDR]], 
align 8 +// TCHECK-NEXT: [[TMP2:%.*]] = load float*, float** [[BN_ADDR]], align 8 +// TCHECK-NEXT: [[TMP3:%.*]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]], align 8 +// TCHECK-NEXT: [[TMP4:%.*]] = load i64, i64* [[VLA_ADDR2]], align 8 +// TCHECK-NEXT: [[TMP5:%.*]] = load i64, i64* [[VLA_ADDR4]], align 8 +// TCHECK-NEXT: [[TMP6:%.*]] = load double*, double** [[CN_ADDR]], align 8 +// TCHECK-NEXT: [[TMP7:%.*]] = load %struct.TT*, %struct.TT** [[D_ADDR]], align 8 +// TCHECK-NEXT: [[TMP8:%.*]] = bitcast [10 x float]* [[B5]] to i8* +// TCHECK-NEXT: [[TMP9:%.*]] = bitcast [10 x float]* [[TMP0]] to i8* +// TCHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i64 40, i1 false) +// TCHECK-NEXT: [[TMP10:%.*]] = call i8* @llvm.stacksave() +// TCHECK-NEXT: store i8* [[TMP10]], i8** [[SAVED_STACK]], align 8 +// TCHECK-NEXT: [[VLA6:%.*]] = alloca float, i64 [[TMP1]], align 4 +// TCHECK-NEXT: store i64 [[TMP1]], i64* [[__VLA_EXPR0]], align 8 +// TCHECK-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP1]], 4 +// TCHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[VLA6]] to i8* +// TCHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP2]] to i8* +// TCHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP12]], i8* align 4 [[TMP13]], i64 [[TMP11]], i1 false) +// TCHECK-NEXT: [[TMP14:%.*]] = bitcast [5 x [10 x double]]* [[C7]] to i8* +// TCHECK-NEXT: [[TMP15:%.*]] = bitcast [5 x [10 x double]]* [[TMP3]] to i8* +// TCHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 400, i1 false) +// TCHECK-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP4]], [[TMP5]] +// TCHECK-NEXT: [[VLA8:%.*]] = alloca double, i64 [[TMP16]], align 8 +// TCHECK-NEXT: store i64 [[TMP4]], i64* [[__VLA_EXPR1]], align 8 +// TCHECK-NEXT: store i64 [[TMP5]], i64* [[__VLA_EXPR2]], align 8 +// TCHECK-NEXT: [[TMP17:%.*]] = mul nuw i64 [[TMP4]], [[TMP5]] +// TCHECK-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 8 +// TCHECK-NEXT: [[TMP19:%.*]] = bitcast double* [[VLA8]] to i8* +// TCHECK-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP6]] to i8* +// TCHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP19]], i8* align 8 [[TMP20]], i64 [[TMP18]], i1 false) +// TCHECK-NEXT: [[TMP21:%.*]] = bitcast %struct.TT* [[D9]] to i8* +// TCHECK-NEXT: [[TMP22:%.*]] = bitcast %struct.TT* [[TMP7]] to i8* +// TCHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP21]], i8* align 8 [[TMP22]], i64 16, i1 false) +// TCHECK-NEXT: [[TMP23:%.*]] = load i16, i16* [[CONV]], align 2 +// TCHECK-NEXT: [[CONV10:%.*]] = sext i16 [[TMP23]] to i32 +// TCHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV10]], 1 +// TCHECK-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD]] to i16 +// TCHECK-NEXT: store i16 [[CONV11]], i16* [[CONV]], align 2 +// TCHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[B5]], i64 0, i64 2 +// TCHECK-NEXT: store float 1.000000e+00, float* [[ARRAYIDX]], align 4 +// TCHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[VLA6]], i64 3 +// TCHECK-NEXT: store float 1.000000e+00, float* [[ARRAYIDX12]], align 4 +// TCHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[C7]], i64 0, i64 1 +// TCHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX13]], i64 0, i64 2 +// TCHECK-NEXT: store double 1.000000e+00, double* [[ARRAYIDX14]], align 8 +// TCHECK-NEXT: [[TMP24:%.*]] = mul nsw i64 1, [[TMP5]] +// TCHECK-NEXT: [[ARRAYIDX15:%.*]] = 
getelementptr inbounds double, double* [[VLA8]], i64 [[TMP24]] +// TCHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX15]], i64 3 +// TCHECK-NEXT: store double 1.000000e+00, double* [[ARRAYIDX16]], align 8 +// TCHECK-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 0 +// TCHECK-NEXT: store i64 1, i64* [[X]], align 8 +// TCHECK-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 1 +// TCHECK-NEXT: store i8 1, i8* [[Y]], align 8 +// TCHECK-NEXT: [[TMP25:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// TCHECK-NEXT: call void @llvm.stackrestore(i8* [[TMP25]]) +// TCHECK-NEXT: ret void +// +// +// TCHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111 +// TCHECK-SAME: (double* noundef [[PTR:%.*]], %struct.TT.0* noundef nonnull align 4 dereferenceable(8) [[E:%.*]]) #[[ATTR0]] { +// TCHECK-NEXT: entry: +// TCHECK-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 8 +// TCHECK-NEXT: [[E_ADDR:%.*]] = alloca %struct.TT.0*, align 8 +// TCHECK-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 8 +// TCHECK-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[E_ADDR]], align 8 +// TCHECK-NEXT: [[TMP0:%.*]] = load %struct.TT.0*, %struct.TT.0** [[E_ADDR]], align 8 +// TCHECK-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0:%.*]], %struct.TT.0* [[TMP0]], i32 0, i32 0 +// TCHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[X]], align 4 +// TCHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP1]] to double +// TCHECK-NEXT: [[TMP2:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// TCHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[TMP2]], i64 0 +// TCHECK-NEXT: store double [[CONV]], double* [[ARRAYIDX]], align 8 +// TCHECK-NEXT: [[TMP3:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// TCHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[TMP3]], i64 0 +// TCHECK-NEXT: [[TMP4:%.*]] = load double, double* [[ARRAYIDX1]], align 8 +// TCHECK-NEXT: [[INC:%.*]] = fadd double [[TMP4]], 1.000000e+00 +// TCHECK-NEXT: store double [[INC]], double* [[ARRAYIDX1]], align 8 +// TCHECK-NEXT: ret void +// +// +// TCHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142 +// TCHECK-SAME: (i64 noundef [[A:%.*]], i64 noundef [[AAA:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// TCHECK-NEXT: entry: +// TCHECK-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// TCHECK-NEXT: [[AAA_ADDR:%.*]] = alloca i64, align 8 +// TCHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// TCHECK-NEXT: [[B2:%.*]] = alloca [10 x i32], align 4 +// TCHECK-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// TCHECK-NEXT: store i64 [[AAA]], i64* [[AAA_ADDR]], align 8 +// TCHECK-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// TCHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// TCHECK-NEXT: [[CONV1:%.*]] = bitcast i64* [[AAA_ADDR]] to i8* +// TCHECK-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// TCHECK-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B2]] to i8* +// TCHECK-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// TCHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i64 40, i1 false) +// TCHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 4 +// TCHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// TCHECK-NEXT: store i32 [[ADD]], i32* [[CONV]], align 4 +// TCHECK-NEXT: 
[[TMP4:%.*]] = load i8, i8* [[CONV1]], align 1 +// TCHECK-NEXT: [[CONV3:%.*]] = sext i8 [[TMP4]] to i32 +// TCHECK-NEXT: [[ADD4:%.*]] = add nsw i32 [[CONV3]], 1 +// TCHECK-NEXT: [[CONV5:%.*]] = trunc i32 [[ADD4]] to i8 +// TCHECK-NEXT: store i8 [[CONV5]], i8* [[CONV1]], align 1 +// TCHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B2]], i64 0, i64 2 +// TCHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// TCHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP5]], 1 +// TCHECK-NEXT: store i32 [[ADD6]], i32* [[ARRAYIDX]], align 4 +// TCHECK-NEXT: ret void +// +// +// TCHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167 +// TCHECK-SAME: (%struct.S1* noundef [[THIS:%.*]], i64 noundef [[B:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR0]] { +// TCHECK-NEXT: entry: +// TCHECK-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 8 +// TCHECK-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 +// TCHECK-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8 +// TCHECK-NEXT: [[VLA_ADDR2:%.*]] = alloca i64, align 8 +// TCHECK-NEXT: [[C_ADDR:%.*]] = alloca i16*, align 8 +// TCHECK-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// TCHECK-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// TCHECK-NEXT: [[__VLA_EXPR1:%.*]] = alloca i64, align 8 +// TCHECK-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 8 +// TCHECK-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8 +// TCHECK-NEXT: store i64 [[VLA]], i64* [[VLA_ADDR]], align 8 +// TCHECK-NEXT: store i64 [[VLA1]], i64* [[VLA_ADDR2]], align 8 +// TCHECK-NEXT: store i16* [[C]], i16** [[C_ADDR]], align 8 +// TCHECK-NEXT: [[TMP0:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 8 +// TCHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[B_ADDR]] to i32* +// TCHECK-NEXT: [[TMP1:%.*]] = load i64, i64* [[VLA_ADDR]], align 8 +// TCHECK-NEXT: [[TMP2:%.*]] = load i64, i64* [[VLA_ADDR2]], align 8 +// TCHECK-NEXT: [[TMP3:%.*]] = load i16*, i16** [[C_ADDR]], align 8 +// TCHECK-NEXT: [[TMP4:%.*]] = call i8* @llvm.stacksave() +// TCHECK-NEXT: store i8* [[TMP4]], i8** [[SAVED_STACK]], align 8 +// TCHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP1]], [[TMP2]] +// TCHECK-NEXT: [[VLA3:%.*]] = alloca i16, i64 [[TMP5]], align 2 +// TCHECK-NEXT: store i64 [[TMP1]], i64* [[__VLA_EXPR0]], align 8 +// TCHECK-NEXT: store i64 [[TMP2]], i64* [[__VLA_EXPR1]], align 8 +// TCHECK-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP1]], [[TMP2]] +// TCHECK-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 2 +// TCHECK-NEXT: [[TMP8:%.*]] = bitcast i16* [[VLA3]] to i8* +// TCHECK-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP3]] to i8* +// TCHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 [[TMP8]], i8* align 2 [[TMP9]], i64 [[TMP7]], i1 false) +// TCHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[CONV]], align 4 +// TCHECK-NEXT: [[CONV4:%.*]] = sitofp i32 [[TMP10]] to double +// TCHECK-NEXT: [[ADD:%.*]] = fadd double [[CONV4]], 1.500000e+00 +// TCHECK-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[TMP0]], i32 0, i32 0 +// TCHECK-NEXT: store double [[ADD]], double* [[A]], align 8 +// TCHECK-NEXT: [[A5:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0 +// TCHECK-NEXT: [[TMP11:%.*]] = load double, double* [[A5]], align 8 +// TCHECK-NEXT: [[INC:%.*]] = fadd double [[TMP11]], 1.000000e+00 +// TCHECK-NEXT: store double [[INC]], double* [[A5]], align 8 +// TCHECK-NEXT: [[CONV6:%.*]] = fptosi double [[INC]] to i16 
+// TCHECK-NEXT: [[TMP12:%.*]] = mul nsw i64 1, [[TMP2]] +// TCHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA3]], i64 [[TMP12]] +// TCHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i64 1 +// TCHECK-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX7]], align 2 +// TCHECK-NEXT: [[TMP13:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// TCHECK-NEXT: call void @llvm.stackrestore(i8* [[TMP13]]) +// TCHECK-NEXT: ret void +// +// +// TCHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128 +// TCHECK-SAME: (i64 noundef [[A:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// TCHECK-NEXT: entry: +// TCHECK-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// TCHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// TCHECK-NEXT: [[B1:%.*]] = alloca [10 x i32], align 4 +// TCHECK-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// TCHECK-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// TCHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// TCHECK-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// TCHECK-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B1]] to i8* +// TCHECK-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// TCHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i64 40, i1 false) +// TCHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 4 +// TCHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// TCHECK-NEXT: store i32 [[ADD]], i32* [[CONV]], align 4 +// TCHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B1]], i64 0, i64 2 +// TCHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// TCHECK-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], 1 +// TCHECK-NEXT: store i32 [[ADD2]], i32* [[ARRAYIDX]], align 4 +// TCHECK-NEXT: ret void +// +// +// TCHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63 +// TCHECK1-SAME: (i64 noundef [[A:%.*]], i32* noundef [[P:%.*]], i64 noundef [[GA:%.*]]) #[[ATTR0:[0-9]+]] { +// TCHECK1-NEXT: entry: +// TCHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// TCHECK1-NEXT: [[P_ADDR:%.*]] = alloca i32*, align 8 +// TCHECK1-NEXT: [[GA_ADDR:%.*]] = alloca i64, align 8 +// TCHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// TCHECK1-NEXT: store i32* [[P]], i32** [[P_ADDR]], align 8 +// TCHECK1-NEXT: store i64 [[GA]], i64* [[GA_ADDR]], align 8 +// TCHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// TCHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[GA_ADDR]] to i32* +// TCHECK1-NEXT: ret void +// +// +// TCHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70 +// TCHECK1-SAME: (i64 noundef [[AA:%.*]], [10 x float]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i64 noundef [[VLA:%.*]], float* noundef nonnull align 4 dereferenceable(4) [[BN:%.*]], [5 x [10 x double]]* noundef nonnull align 8 dereferenceable(400) [[C:%.*]], i64 noundef [[VLA1:%.*]], i64 noundef [[VLA3:%.*]], double* noundef nonnull align 8 dereferenceable(8) [[CN:%.*]], %struct.TT* noundef nonnull align 8 dereferenceable(16) [[D:%.*]]) #[[ATTR0]] { +// TCHECK1-NEXT: entry: +// TCHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i64, align 8 +// TCHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x float]*, align 8 +// TCHECK1-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8 +// TCHECK1-NEXT: [[BN_ADDR:%.*]] = alloca float*, align 8 +// TCHECK1-NEXT: [[C_ADDR:%.*]] = alloca [5 x [10 x 
double]]*, align 8 +// TCHECK1-NEXT: [[VLA_ADDR2:%.*]] = alloca i64, align 8 +// TCHECK1-NEXT: [[VLA_ADDR4:%.*]] = alloca i64, align 8 +// TCHECK1-NEXT: [[CN_ADDR:%.*]] = alloca double*, align 8 +// TCHECK1-NEXT: [[D_ADDR:%.*]] = alloca %struct.TT*, align 8 +// TCHECK1-NEXT: [[B5:%.*]] = alloca [10 x float], align 4 +// TCHECK1-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// TCHECK1-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// TCHECK1-NEXT: [[C7:%.*]] = alloca [5 x [10 x double]], align 8 +// TCHECK1-NEXT: [[__VLA_EXPR1:%.*]] = alloca i64, align 8 +// TCHECK1-NEXT: [[__VLA_EXPR2:%.*]] = alloca i64, align 8 +// TCHECK1-NEXT: [[D9:%.*]] = alloca [[STRUCT_TT:%.*]], align 8 +// TCHECK1-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8 +// TCHECK1-NEXT: store [10 x float]* [[B]], [10 x float]** [[B_ADDR]], align 8 +// TCHECK1-NEXT: store i64 [[VLA]], i64* [[VLA_ADDR]], align 8 +// TCHECK1-NEXT: store float* [[BN]], float** [[BN_ADDR]], align 8 +// TCHECK1-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[C_ADDR]], align 8 +// TCHECK1-NEXT: store i64 [[VLA1]], i64* [[VLA_ADDR2]], align 8 +// TCHECK1-NEXT: store i64 [[VLA3]], i64* [[VLA_ADDR4]], align 8 +// TCHECK1-NEXT: store double* [[CN]], double** [[CN_ADDR]], align 8 +// TCHECK1-NEXT: store %struct.TT* [[D]], %struct.TT** [[D_ADDR]], align 8 +// TCHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[AA_ADDR]] to i16* +// TCHECK1-NEXT: [[TMP0:%.*]] = load [10 x float]*, [10 x float]** [[B_ADDR]], align 8 +// TCHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* [[VLA_ADDR]], align 8 +// TCHECK1-NEXT: [[TMP2:%.*]] = load float*, float** [[BN_ADDR]], align 8 +// TCHECK1-NEXT: [[TMP3:%.*]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]], align 8 +// TCHECK1-NEXT: [[TMP4:%.*]] = load i64, i64* [[VLA_ADDR2]], align 8 +// TCHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[VLA_ADDR4]], align 8 +// TCHECK1-NEXT: [[TMP6:%.*]] = load double*, double** [[CN_ADDR]], align 8 +// TCHECK1-NEXT: [[TMP7:%.*]] = load %struct.TT*, %struct.TT** [[D_ADDR]], align 8 +// TCHECK1-NEXT: [[TMP8:%.*]] = bitcast [10 x float]* [[B5]] to i8* +// TCHECK1-NEXT: [[TMP9:%.*]] = bitcast [10 x float]* [[TMP0]] to i8* +// TCHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i64 40, i1 false) +// TCHECK1-NEXT: [[TMP10:%.*]] = call i8* @llvm.stacksave() +// TCHECK1-NEXT: store i8* [[TMP10]], i8** [[SAVED_STACK]], align 8 +// TCHECK1-NEXT: [[VLA6:%.*]] = alloca float, i64 [[TMP1]], align 4 +// TCHECK1-NEXT: store i64 [[TMP1]], i64* [[__VLA_EXPR0]], align 8 +// TCHECK1-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP1]], 4 +// TCHECK1-NEXT: [[TMP12:%.*]] = bitcast float* [[VLA6]] to i8* +// TCHECK1-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP2]] to i8* +// TCHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP12]], i8* align 4 [[TMP13]], i64 [[TMP11]], i1 false) +// TCHECK1-NEXT: [[TMP14:%.*]] = bitcast [5 x [10 x double]]* [[C7]] to i8* +// TCHECK1-NEXT: [[TMP15:%.*]] = bitcast [5 x [10 x double]]* [[TMP3]] to i8* +// TCHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 400, i1 false) +// TCHECK1-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP4]], [[TMP5]] +// TCHECK1-NEXT: [[VLA8:%.*]] = alloca double, i64 [[TMP16]], align 8 +// TCHECK1-NEXT: store i64 [[TMP4]], i64* [[__VLA_EXPR1]], align 8 +// TCHECK1-NEXT: store i64 [[TMP5]], i64* [[__VLA_EXPR2]], align 8 +// TCHECK1-NEXT: [[TMP17:%.*]] = mul nuw i64 [[TMP4]], [[TMP5]] +// TCHECK1-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 8 +// 
TCHECK1-NEXT: [[TMP19:%.*]] = bitcast double* [[VLA8]] to i8* +// TCHECK1-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP6]] to i8* +// TCHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP19]], i8* align 8 [[TMP20]], i64 [[TMP18]], i1 false) +// TCHECK1-NEXT: [[TMP21:%.*]] = bitcast %struct.TT* [[D9]] to i8* +// TCHECK1-NEXT: [[TMP22:%.*]] = bitcast %struct.TT* [[TMP7]] to i8* +// TCHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP21]], i8* align 8 [[TMP22]], i64 16, i1 false) +// TCHECK1-NEXT: [[TMP23:%.*]] = load i16, i16* [[CONV]], align 2 +// TCHECK1-NEXT: [[CONV10:%.*]] = sext i16 [[TMP23]] to i32 +// TCHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV10]], 1 +// TCHECK1-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD]] to i16 +// TCHECK1-NEXT: store i16 [[CONV11]], i16* [[CONV]], align 2 +// TCHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[B5]], i64 0, i64 2 +// TCHECK1-NEXT: store float 1.000000e+00, float* [[ARRAYIDX]], align 4 +// TCHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[VLA6]], i64 3 +// TCHECK1-NEXT: store float 1.000000e+00, float* [[ARRAYIDX12]], align 4 +// TCHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[C7]], i64 0, i64 1 +// TCHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX13]], i64 0, i64 2 +// TCHECK1-NEXT: store double 1.000000e+00, double* [[ARRAYIDX14]], align 8 +// TCHECK1-NEXT: [[TMP24:%.*]] = mul nsw i64 1, [[TMP5]] +// TCHECK1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds double, double* [[VLA8]], i64 [[TMP24]] +// TCHECK1-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX15]], i64 3 +// TCHECK1-NEXT: store double 1.000000e+00, double* [[ARRAYIDX16]], align 8 +// TCHECK1-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 0 +// TCHECK1-NEXT: store i64 1, i64* [[X]], align 8 +// TCHECK1-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 1 +// TCHECK1-NEXT: store i8 1, i8* [[Y]], align 8 +// TCHECK1-NEXT: [[TMP25:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// TCHECK1-NEXT: call void @llvm.stackrestore(i8* [[TMP25]]) +// TCHECK1-NEXT: ret void +// +// +// TCHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111 +// TCHECK1-SAME: (double* noundef [[PTR:%.*]], %struct.TT.0* noundef nonnull align 4 dereferenceable(8) [[E:%.*]]) #[[ATTR0]] { +// TCHECK1-NEXT: entry: +// TCHECK1-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 8 +// TCHECK1-NEXT: [[E_ADDR:%.*]] = alloca %struct.TT.0*, align 8 +// TCHECK1-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 8 +// TCHECK1-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[E_ADDR]], align 8 +// TCHECK1-NEXT: [[TMP0:%.*]] = load %struct.TT.0*, %struct.TT.0** [[E_ADDR]], align 8 +// TCHECK1-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0:%.*]], %struct.TT.0* [[TMP0]], i32 0, i32 0 +// TCHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[X]], align 4 +// TCHECK1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP1]] to double +// TCHECK1-NEXT: [[TMP2:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// TCHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[TMP2]], i64 0 +// TCHECK1-NEXT: store double [[CONV]], double* [[ARRAYIDX]], align 8 +// TCHECK1-NEXT: [[TMP3:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// TCHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* 
[[TMP3]], i64 0 +// TCHECK1-NEXT: [[TMP4:%.*]] = load double, double* [[ARRAYIDX1]], align 8 +// TCHECK1-NEXT: [[INC:%.*]] = fadd double [[TMP4]], 1.000000e+00 +// TCHECK1-NEXT: store double [[INC]], double* [[ARRAYIDX1]], align 8 +// TCHECK1-NEXT: ret void +// +// +// TCHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142 +// TCHECK1-SAME: (i64 noundef [[A:%.*]], i64 noundef [[AAA:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// TCHECK1-NEXT: entry: +// TCHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// TCHECK1-NEXT: [[AAA_ADDR:%.*]] = alloca i64, align 8 +// TCHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// TCHECK1-NEXT: [[B2:%.*]] = alloca [10 x i32], align 4 +// TCHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// TCHECK1-NEXT: store i64 [[AAA]], i64* [[AAA_ADDR]], align 8 +// TCHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// TCHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// TCHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[AAA_ADDR]] to i8* +// TCHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// TCHECK1-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B2]] to i8* +// TCHECK1-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// TCHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i64 40, i1 false) +// TCHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 4 +// TCHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// TCHECK1-NEXT: store i32 [[ADD]], i32* [[CONV]], align 4 +// TCHECK1-NEXT: [[TMP4:%.*]] = load i8, i8* [[CONV1]], align 1 +// TCHECK1-NEXT: [[CONV3:%.*]] = sext i8 [[TMP4]] to i32 +// TCHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[CONV3]], 1 +// TCHECK1-NEXT: [[CONV5:%.*]] = trunc i32 [[ADD4]] to i8 +// TCHECK1-NEXT: store i8 [[CONV5]], i8* [[CONV1]], align 1 +// TCHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B2]], i64 0, i64 2 +// TCHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// TCHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP5]], 1 +// TCHECK1-NEXT: store i32 [[ADD6]], i32* [[ARRAYIDX]], align 4 +// TCHECK1-NEXT: ret void +// +// +// TCHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167 +// TCHECK1-SAME: (%struct.S1* noundef [[THIS:%.*]], i64 noundef [[B:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR0]] { +// TCHECK1-NEXT: entry: +// TCHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 8 +// TCHECK1-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 +// TCHECK1-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8 +// TCHECK1-NEXT: [[VLA_ADDR2:%.*]] = alloca i64, align 8 +// TCHECK1-NEXT: [[C_ADDR:%.*]] = alloca i16*, align 8 +// TCHECK1-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// TCHECK1-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// TCHECK1-NEXT: [[__VLA_EXPR1:%.*]] = alloca i64, align 8 +// TCHECK1-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 8 +// TCHECK1-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8 +// TCHECK1-NEXT: store i64 [[VLA]], i64* [[VLA_ADDR]], align 8 +// TCHECK1-NEXT: store i64 [[VLA1]], i64* [[VLA_ADDR2]], align 8 +// TCHECK1-NEXT: store i16* [[C]], i16** [[C_ADDR]], align 8 +// TCHECK1-NEXT: [[TMP0:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 8 +// TCHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[B_ADDR]] to i32* +// TCHECK1-NEXT: 
[[TMP1:%.*]] = load i64, i64* [[VLA_ADDR]], align 8 +// TCHECK1-NEXT: [[TMP2:%.*]] = load i64, i64* [[VLA_ADDR2]], align 8 +// TCHECK1-NEXT: [[TMP3:%.*]] = load i16*, i16** [[C_ADDR]], align 8 +// TCHECK1-NEXT: [[TMP4:%.*]] = call i8* @llvm.stacksave() +// TCHECK1-NEXT: store i8* [[TMP4]], i8** [[SAVED_STACK]], align 8 +// TCHECK1-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP1]], [[TMP2]] +// TCHECK1-NEXT: [[VLA3:%.*]] = alloca i16, i64 [[TMP5]], align 2 +// TCHECK1-NEXT: store i64 [[TMP1]], i64* [[__VLA_EXPR0]], align 8 +// TCHECK1-NEXT: store i64 [[TMP2]], i64* [[__VLA_EXPR1]], align 8 +// TCHECK1-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP1]], [[TMP2]] +// TCHECK1-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 2 +// TCHECK1-NEXT: [[TMP8:%.*]] = bitcast i16* [[VLA3]] to i8* +// TCHECK1-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP3]] to i8* +// TCHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 [[TMP8]], i8* align 2 [[TMP9]], i64 [[TMP7]], i1 false) +// TCHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[CONV]], align 4 +// TCHECK1-NEXT: [[CONV4:%.*]] = sitofp i32 [[TMP10]] to double +// TCHECK1-NEXT: [[ADD:%.*]] = fadd double [[CONV4]], 1.500000e+00 +// TCHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[TMP0]], i32 0, i32 0 +// TCHECK1-NEXT: store double [[ADD]], double* [[A]], align 8 +// TCHECK1-NEXT: [[A5:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0 +// TCHECK1-NEXT: [[TMP11:%.*]] = load double, double* [[A5]], align 8 +// TCHECK1-NEXT: [[INC:%.*]] = fadd double [[TMP11]], 1.000000e+00 +// TCHECK1-NEXT: store double [[INC]], double* [[A5]], align 8 +// TCHECK1-NEXT: [[CONV6:%.*]] = fptosi double [[INC]] to i16 +// TCHECK1-NEXT: [[TMP12:%.*]] = mul nsw i64 1, [[TMP2]] +// TCHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA3]], i64 [[TMP12]] +// TCHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i64 1 +// TCHECK1-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX7]], align 2 +// TCHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// TCHECK1-NEXT: call void @llvm.stackrestore(i8* [[TMP13]]) +// TCHECK1-NEXT: ret void +// +// +// TCHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128 +// TCHECK1-SAME: (i64 noundef [[A:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// TCHECK1-NEXT: entry: +// TCHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// TCHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// TCHECK1-NEXT: [[B1:%.*]] = alloca [10 x i32], align 4 +// TCHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// TCHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// TCHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// TCHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// TCHECK1-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B1]] to i8* +// TCHECK1-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// TCHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i64 40, i1 false) +// TCHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 4 +// TCHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// TCHECK1-NEXT: store i32 [[ADD]], i32* [[CONV]], align 4 +// TCHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B1]], i64 0, i64 2 +// TCHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// TCHECK1-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], 1 
+// TCHECK1-NEXT: store i32 [[ADD2]], i32* [[ARRAYIDX]], align 4 +// TCHECK1-NEXT: ret void +// +// +// TCHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63 +// TCHECK2-SAME: (i32 noundef [[A:%.*]], i32* noundef [[P:%.*]], i32 noundef [[GA:%.*]]) #[[ATTR0:[0-9]+]] { +// TCHECK2-NEXT: entry: +// TCHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// TCHECK2-NEXT: [[P_ADDR:%.*]] = alloca i32*, align 4 +// TCHECK2-NEXT: [[GA_ADDR:%.*]] = alloca i32, align 4 +// TCHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// TCHECK2-NEXT: store i32* [[P]], i32** [[P_ADDR]], align 4 +// TCHECK2-NEXT: store i32 [[GA]], i32* [[GA_ADDR]], align 4 +// TCHECK2-NEXT: ret void +// +// +// TCHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70 +// TCHECK2-SAME: (i32 noundef [[AA:%.*]], [10 x float]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i32 noundef [[VLA:%.*]], float* noundef nonnull align 4 dereferenceable(4) [[BN:%.*]], [5 x [10 x double]]* noundef nonnull align 4 dereferenceable(400) [[C:%.*]], i32 noundef [[VLA1:%.*]], i32 noundef [[VLA3:%.*]], double* noundef nonnull align 4 dereferenceable(8) [[CN:%.*]], %struct.TT* noundef nonnull align 4 dereferenceable(12) [[D:%.*]]) #[[ATTR0]] { +// TCHECK2-NEXT: entry: +// TCHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4 +// TCHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x float]*, align 4 +// TCHECK2-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4 +// TCHECK2-NEXT: [[BN_ADDR:%.*]] = alloca float*, align 4 +// TCHECK2-NEXT: [[C_ADDR:%.*]] = alloca [5 x [10 x double]]*, align 4 +// TCHECK2-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4 +// TCHECK2-NEXT: [[VLA_ADDR4:%.*]] = alloca i32, align 4 +// TCHECK2-NEXT: [[CN_ADDR:%.*]] = alloca double*, align 4 +// TCHECK2-NEXT: [[D_ADDR:%.*]] = alloca %struct.TT*, align 4 +// TCHECK2-NEXT: [[B5:%.*]] = alloca [10 x float], align 4 +// TCHECK2-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// TCHECK2-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// TCHECK2-NEXT: [[C7:%.*]] = alloca [5 x [10 x double]], align 8 +// TCHECK2-NEXT: [[__VLA_EXPR1:%.*]] = alloca i32, align 4 +// TCHECK2-NEXT: [[__VLA_EXPR2:%.*]] = alloca i32, align 4 +// TCHECK2-NEXT: [[D9:%.*]] = alloca [[STRUCT_TT:%.*]], align 4 +// TCHECK2-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 +// TCHECK2-NEXT: store [10 x float]* [[B]], [10 x float]** [[B_ADDR]], align 4 +// TCHECK2-NEXT: store i32 [[VLA]], i32* [[VLA_ADDR]], align 4 +// TCHECK2-NEXT: store float* [[BN]], float** [[BN_ADDR]], align 4 +// TCHECK2-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[C_ADDR]], align 4 +// TCHECK2-NEXT: store i32 [[VLA1]], i32* [[VLA_ADDR2]], align 4 +// TCHECK2-NEXT: store i32 [[VLA3]], i32* [[VLA_ADDR4]], align 4 +// TCHECK2-NEXT: store double* [[CN]], double** [[CN_ADDR]], align 4 +// TCHECK2-NEXT: store %struct.TT* [[D]], %struct.TT** [[D_ADDR]], align 4 +// TCHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* +// TCHECK2-NEXT: [[TMP0:%.*]] = load [10 x float]*, [10 x float]** [[B_ADDR]], align 4 +// TCHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4 +// TCHECK2-NEXT: [[TMP2:%.*]] = load float*, float** [[BN_ADDR]], align 4 +// TCHECK2-NEXT: [[TMP3:%.*]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]], align 4 +// TCHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4 +// TCHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[VLA_ADDR4]], align 4 +// TCHECK2-NEXT: [[TMP6:%.*]] = load double*, double** [[CN_ADDR]], align 4 +// 
TCHECK2-NEXT: [[TMP7:%.*]] = load %struct.TT*, %struct.TT** [[D_ADDR]], align 4 +// TCHECK2-NEXT: [[TMP8:%.*]] = bitcast [10 x float]* [[B5]] to i8* +// TCHECK2-NEXT: [[TMP9:%.*]] = bitcast [10 x float]* [[TMP0]] to i8* +// TCHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i32 40, i1 false) +// TCHECK2-NEXT: [[TMP10:%.*]] = call i8* @llvm.stacksave() +// TCHECK2-NEXT: store i8* [[TMP10]], i8** [[SAVED_STACK]], align 4 +// TCHECK2-NEXT: [[VLA6:%.*]] = alloca float, i32 [[TMP1]], align 4 +// TCHECK2-NEXT: store i32 [[TMP1]], i32* [[__VLA_EXPR0]], align 4 +// TCHECK2-NEXT: [[TMP11:%.*]] = mul nuw i32 [[TMP1]], 4 +// TCHECK2-NEXT: [[TMP12:%.*]] = bitcast float* [[VLA6]] to i8* +// TCHECK2-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP2]] to i8* +// TCHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP12]], i8* align 4 [[TMP13]], i32 [[TMP11]], i1 false) +// TCHECK2-NEXT: [[TMP14:%.*]] = bitcast [5 x [10 x double]]* [[C7]] to i8* +// TCHECK2-NEXT: [[TMP15:%.*]] = bitcast [5 x [10 x double]]* [[TMP3]] to i8* +// TCHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i32 400, i1 false) +// TCHECK2-NEXT: [[TMP16:%.*]] = mul nuw i32 [[TMP4]], [[TMP5]] +// TCHECK2-NEXT: [[VLA8:%.*]] = alloca double, i32 [[TMP16]], align 8 +// TCHECK2-NEXT: store i32 [[TMP4]], i32* [[__VLA_EXPR1]], align 4 +// TCHECK2-NEXT: store i32 [[TMP5]], i32* [[__VLA_EXPR2]], align 4 +// TCHECK2-NEXT: [[TMP17:%.*]] = mul nuw i32 [[TMP4]], [[TMP5]] +// TCHECK2-NEXT: [[TMP18:%.*]] = mul nuw i32 [[TMP17]], 8 +// TCHECK2-NEXT: [[TMP19:%.*]] = bitcast double* [[VLA8]] to i8* +// TCHECK2-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP6]] to i8* +// TCHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP19]], i8* align 8 [[TMP20]], i32 [[TMP18]], i1 false) +// TCHECK2-NEXT: [[TMP21:%.*]] = bitcast %struct.TT* [[D9]] to i8* +// TCHECK2-NEXT: [[TMP22:%.*]] = bitcast %struct.TT* [[TMP7]] to i8* +// TCHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP21]], i8* align 4 [[TMP22]], i32 12, i1 false) +// TCHECK2-NEXT: [[TMP23:%.*]] = load i16, i16* [[CONV]], align 2 +// TCHECK2-NEXT: [[CONV10:%.*]] = sext i16 [[TMP23]] to i32 +// TCHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV10]], 1 +// TCHECK2-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD]] to i16 +// TCHECK2-NEXT: store i16 [[CONV11]], i16* [[CONV]], align 2 +// TCHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[B5]], i32 0, i32 2 +// TCHECK2-NEXT: store float 1.000000e+00, float* [[ARRAYIDX]], align 4 +// TCHECK2-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[VLA6]], i32 3 +// TCHECK2-NEXT: store float 1.000000e+00, float* [[ARRAYIDX12]], align 4 +// TCHECK2-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[C7]], i32 0, i32 1 +// TCHECK2-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX13]], i32 0, i32 2 +// TCHECK2-NEXT: store double 1.000000e+00, double* [[ARRAYIDX14]], align 8 +// TCHECK2-NEXT: [[TMP24:%.*]] = mul nsw i32 1, [[TMP5]] +// TCHECK2-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds double, double* [[VLA8]], i32 [[TMP24]] +// TCHECK2-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX15]], i32 3 +// TCHECK2-NEXT: store double 1.000000e+00, double* [[ARRAYIDX16]], align 8 +// TCHECK2-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 0 +// TCHECK2-NEXT: store i64 1, 
i64* [[X]], align 4 +// TCHECK2-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 1 +// TCHECK2-NEXT: store i8 1, i8* [[Y]], align 4 +// TCHECK2-NEXT: [[TMP25:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// TCHECK2-NEXT: call void @llvm.stackrestore(i8* [[TMP25]]) +// TCHECK2-NEXT: ret void +// +// +// TCHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111 +// TCHECK2-SAME: (double* noundef [[PTR:%.*]], %struct.TT.0* noundef nonnull align 4 dereferenceable(8) [[E:%.*]]) #[[ATTR0]] { +// TCHECK2-NEXT: entry: +// TCHECK2-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 4 +// TCHECK2-NEXT: [[E_ADDR:%.*]] = alloca %struct.TT.0*, align 4 +// TCHECK2-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 4 +// TCHECK2-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[E_ADDR]], align 4 +// TCHECK2-NEXT: [[TMP0:%.*]] = load %struct.TT.0*, %struct.TT.0** [[E_ADDR]], align 4 +// TCHECK2-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0:%.*]], %struct.TT.0* [[TMP0]], i32 0, i32 0 +// TCHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[X]], align 4 +// TCHECK2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP1]] to double +// TCHECK2-NEXT: [[TMP2:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// TCHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0 +// TCHECK2-NEXT: store double [[CONV]], double* [[ARRAYIDX]], align 4 +// TCHECK2-NEXT: [[TMP3:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// TCHECK2-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0 +// TCHECK2-NEXT: [[TMP4:%.*]] = load double, double* [[ARRAYIDX1]], align 4 +// TCHECK2-NEXT: [[INC:%.*]] = fadd double [[TMP4]], 1.000000e+00 +// TCHECK2-NEXT: store double [[INC]], double* [[ARRAYIDX1]], align 4 +// TCHECK2-NEXT: ret void +// +// +// TCHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142 +// TCHECK2-SAME: (i32 noundef [[A:%.*]], i32 noundef [[AAA:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// TCHECK2-NEXT: entry: +// TCHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// TCHECK2-NEXT: [[AAA_ADDR:%.*]] = alloca i32, align 4 +// TCHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// TCHECK2-NEXT: [[B1:%.*]] = alloca [10 x i32], align 4 +// TCHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// TCHECK2-NEXT: store i32 [[AAA]], i32* [[AAA_ADDR]], align 4 +// TCHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// TCHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[AAA_ADDR]] to i8* +// TCHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// TCHECK2-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B1]] to i8* +// TCHECK2-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// TCHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i32 40, i1 false) +// TCHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// TCHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// TCHECK2-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// TCHECK2-NEXT: [[TMP4:%.*]] = load i8, i8* [[CONV]], align 1 +// TCHECK2-NEXT: [[CONV2:%.*]] = sext i8 [[TMP4]] to i32 +// TCHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 [[CONV2]], 1 +// TCHECK2-NEXT: [[CONV4:%.*]] = trunc i32 [[ADD3]] to i8 +// TCHECK2-NEXT: store i8 [[CONV4]], i8* [[CONV]], align 1 +// TCHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B1]], i32 0, i32 2 +// 
TCHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// TCHECK2-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP5]], 1 +// TCHECK2-NEXT: store i32 [[ADD5]], i32* [[ARRAYIDX]], align 4 +// TCHECK2-NEXT: ret void +// +// +// TCHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167 +// TCHECK2-SAME: (%struct.S1* noundef [[THIS:%.*]], i32 noundef [[B:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR0]] { +// TCHECK2-NEXT: entry: +// TCHECK2-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 4 +// TCHECK2-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// TCHECK2-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4 +// TCHECK2-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4 +// TCHECK2-NEXT: [[C_ADDR:%.*]] = alloca i16*, align 4 +// TCHECK2-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// TCHECK2-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// TCHECK2-NEXT: [[__VLA_EXPR1:%.*]] = alloca i32, align 4 +// TCHECK2-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 4 +// TCHECK2-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 +// TCHECK2-NEXT: store i32 [[VLA]], i32* [[VLA_ADDR]], align 4 +// TCHECK2-NEXT: store i32 [[VLA1]], i32* [[VLA_ADDR2]], align 4 +// TCHECK2-NEXT: store i16* [[C]], i16** [[C_ADDR]], align 4 +// TCHECK2-NEXT: [[TMP0:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 4 +// TCHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4 +// TCHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4 +// TCHECK2-NEXT: [[TMP3:%.*]] = load i16*, i16** [[C_ADDR]], align 4 +// TCHECK2-NEXT: [[TMP4:%.*]] = call i8* @llvm.stacksave() +// TCHECK2-NEXT: store i8* [[TMP4]], i8** [[SAVED_STACK]], align 4 +// TCHECK2-NEXT: [[TMP5:%.*]] = mul nuw i32 [[TMP1]], [[TMP2]] +// TCHECK2-NEXT: [[VLA3:%.*]] = alloca i16, i32 [[TMP5]], align 2 +// TCHECK2-NEXT: store i32 [[TMP1]], i32* [[__VLA_EXPR0]], align 4 +// TCHECK2-NEXT: store i32 [[TMP2]], i32* [[__VLA_EXPR1]], align 4 +// TCHECK2-NEXT: [[TMP6:%.*]] = mul nuw i32 [[TMP1]], [[TMP2]] +// TCHECK2-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 2 +// TCHECK2-NEXT: [[TMP8:%.*]] = bitcast i16* [[VLA3]] to i8* +// TCHECK2-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP3]] to i8* +// TCHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[TMP8]], i8* align 2 [[TMP9]], i32 [[TMP7]], i1 false) +// TCHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[B_ADDR]], align 4 +// TCHECK2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP10]] to double +// TCHECK2-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00 +// TCHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[TMP0]], i32 0, i32 0 +// TCHECK2-NEXT: store double [[ADD]], double* [[A]], align 4 +// TCHECK2-NEXT: [[A4:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0 +// TCHECK2-NEXT: [[TMP11:%.*]] = load double, double* [[A4]], align 4 +// TCHECK2-NEXT: [[INC:%.*]] = fadd double [[TMP11]], 1.000000e+00 +// TCHECK2-NEXT: store double [[INC]], double* [[A4]], align 4 +// TCHECK2-NEXT: [[CONV5:%.*]] = fptosi double [[INC]] to i16 +// TCHECK2-NEXT: [[TMP12:%.*]] = mul nsw i32 1, [[TMP2]] +// TCHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA3]], i32 [[TMP12]] +// TCHECK2-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i32 1 +// TCHECK2-NEXT: store i16 [[CONV5]], i16* [[ARRAYIDX6]], align 2 +// TCHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// TCHECK2-NEXT: call 
void @llvm.stackrestore(i8* [[TMP13]]) +// TCHECK2-NEXT: ret void +// +// +// TCHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128 +// TCHECK2-SAME: (i32 noundef [[A:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// TCHECK2-NEXT: entry: +// TCHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// TCHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// TCHECK2-NEXT: [[B1:%.*]] = alloca [10 x i32], align 4 +// TCHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// TCHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// TCHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// TCHECK2-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B1]] to i8* +// TCHECK2-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// TCHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i32 40, i1 false) +// TCHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// TCHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// TCHECK2-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// TCHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B1]], i32 0, i32 2 +// TCHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// TCHECK2-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], 1 +// TCHECK2-NEXT: store i32 [[ADD2]], i32* [[ARRAYIDX]], align 4 +// TCHECK2-NEXT: ret void +// +// +// TCHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l63 +// TCHECK3-SAME: (i32 noundef [[A:%.*]], i32* noundef [[P:%.*]], i32 noundef [[GA:%.*]]) #[[ATTR0:[0-9]+]] { +// TCHECK3-NEXT: entry: +// TCHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// TCHECK3-NEXT: [[P_ADDR:%.*]] = alloca i32*, align 4 +// TCHECK3-NEXT: [[GA_ADDR:%.*]] = alloca i32, align 4 +// TCHECK3-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// TCHECK3-NEXT: store i32* [[P]], i32** [[P_ADDR]], align 4 +// TCHECK3-NEXT: store i32 [[GA]], i32* [[GA_ADDR]], align 4 +// TCHECK3-NEXT: ret void +// +// +// TCHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l70 +// TCHECK3-SAME: (i32 noundef [[AA:%.*]], [10 x float]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i32 noundef [[VLA:%.*]], float* noundef nonnull align 4 dereferenceable(4) [[BN:%.*]], [5 x [10 x double]]* noundef nonnull align 4 dereferenceable(400) [[C:%.*]], i32 noundef [[VLA1:%.*]], i32 noundef [[VLA3:%.*]], double* noundef nonnull align 4 dereferenceable(8) [[CN:%.*]], %struct.TT* noundef nonnull align 4 dereferenceable(12) [[D:%.*]]) #[[ATTR0]] { +// TCHECK3-NEXT: entry: +// TCHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4 +// TCHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x float]*, align 4 +// TCHECK3-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4 +// TCHECK3-NEXT: [[BN_ADDR:%.*]] = alloca float*, align 4 +// TCHECK3-NEXT: [[C_ADDR:%.*]] = alloca [5 x [10 x double]]*, align 4 +// TCHECK3-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4 +// TCHECK3-NEXT: [[VLA_ADDR4:%.*]] = alloca i32, align 4 +// TCHECK3-NEXT: [[CN_ADDR:%.*]] = alloca double*, align 4 +// TCHECK3-NEXT: [[D_ADDR:%.*]] = alloca %struct.TT*, align 4 +// TCHECK3-NEXT: [[B5:%.*]] = alloca [10 x float], align 4 +// TCHECK3-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// TCHECK3-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// TCHECK3-NEXT: [[C7:%.*]] = alloca [5 x [10 x double]], align 8 +// TCHECK3-NEXT: [[__VLA_EXPR1:%.*]] = alloca i32, align 4 +// 
TCHECK3-NEXT: [[__VLA_EXPR2:%.*]] = alloca i32, align 4 +// TCHECK3-NEXT: [[D9:%.*]] = alloca [[STRUCT_TT:%.*]], align 4 +// TCHECK3-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 +// TCHECK3-NEXT: store [10 x float]* [[B]], [10 x float]** [[B_ADDR]], align 4 +// TCHECK3-NEXT: store i32 [[VLA]], i32* [[VLA_ADDR]], align 4 +// TCHECK3-NEXT: store float* [[BN]], float** [[BN_ADDR]], align 4 +// TCHECK3-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[C_ADDR]], align 4 +// TCHECK3-NEXT: store i32 [[VLA1]], i32* [[VLA_ADDR2]], align 4 +// TCHECK3-NEXT: store i32 [[VLA3]], i32* [[VLA_ADDR4]], align 4 +// TCHECK3-NEXT: store double* [[CN]], double** [[CN_ADDR]], align 4 +// TCHECK3-NEXT: store %struct.TT* [[D]], %struct.TT** [[D_ADDR]], align 4 +// TCHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* +// TCHECK3-NEXT: [[TMP0:%.*]] = load [10 x float]*, [10 x float]** [[B_ADDR]], align 4 +// TCHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4 +// TCHECK3-NEXT: [[TMP2:%.*]] = load float*, float** [[BN_ADDR]], align 4 +// TCHECK3-NEXT: [[TMP3:%.*]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]], align 4 +// TCHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4 +// TCHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[VLA_ADDR4]], align 4 +// TCHECK3-NEXT: [[TMP6:%.*]] = load double*, double** [[CN_ADDR]], align 4 +// TCHECK3-NEXT: [[TMP7:%.*]] = load %struct.TT*, %struct.TT** [[D_ADDR]], align 4 +// TCHECK3-NEXT: [[TMP8:%.*]] = bitcast [10 x float]* [[B5]] to i8* +// TCHECK3-NEXT: [[TMP9:%.*]] = bitcast [10 x float]* [[TMP0]] to i8* +// TCHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i32 40, i1 false) +// TCHECK3-NEXT: [[TMP10:%.*]] = call i8* @llvm.stacksave() +// TCHECK3-NEXT: store i8* [[TMP10]], i8** [[SAVED_STACK]], align 4 +// TCHECK3-NEXT: [[VLA6:%.*]] = alloca float, i32 [[TMP1]], align 4 +// TCHECK3-NEXT: store i32 [[TMP1]], i32* [[__VLA_EXPR0]], align 4 +// TCHECK3-NEXT: [[TMP11:%.*]] = mul nuw i32 [[TMP1]], 4 +// TCHECK3-NEXT: [[TMP12:%.*]] = bitcast float* [[VLA6]] to i8* +// TCHECK3-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP2]] to i8* +// TCHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP12]], i8* align 4 [[TMP13]], i32 [[TMP11]], i1 false) +// TCHECK3-NEXT: [[TMP14:%.*]] = bitcast [5 x [10 x double]]* [[C7]] to i8* +// TCHECK3-NEXT: [[TMP15:%.*]] = bitcast [5 x [10 x double]]* [[TMP3]] to i8* +// TCHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i32 400, i1 false) +// TCHECK3-NEXT: [[TMP16:%.*]] = mul nuw i32 [[TMP4]], [[TMP5]] +// TCHECK3-NEXT: [[VLA8:%.*]] = alloca double, i32 [[TMP16]], align 8 +// TCHECK3-NEXT: store i32 [[TMP4]], i32* [[__VLA_EXPR1]], align 4 +// TCHECK3-NEXT: store i32 [[TMP5]], i32* [[__VLA_EXPR2]], align 4 +// TCHECK3-NEXT: [[TMP17:%.*]] = mul nuw i32 [[TMP4]], [[TMP5]] +// TCHECK3-NEXT: [[TMP18:%.*]] = mul nuw i32 [[TMP17]], 8 +// TCHECK3-NEXT: [[TMP19:%.*]] = bitcast double* [[VLA8]] to i8* +// TCHECK3-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP6]] to i8* +// TCHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP19]], i8* align 8 [[TMP20]], i32 [[TMP18]], i1 false) +// TCHECK3-NEXT: [[TMP21:%.*]] = bitcast %struct.TT* [[D9]] to i8* +// TCHECK3-NEXT: [[TMP22:%.*]] = bitcast %struct.TT* [[TMP7]] to i8* +// TCHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP21]], i8* align 4 [[TMP22]], i32 12, i1 false) +// TCHECK3-NEXT: [[TMP23:%.*]] = load i16, i16* [[CONV]], 
align 2 +// TCHECK3-NEXT: [[CONV10:%.*]] = sext i16 [[TMP23]] to i32 +// TCHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV10]], 1 +// TCHECK3-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD]] to i16 +// TCHECK3-NEXT: store i16 [[CONV11]], i16* [[CONV]], align 2 +// TCHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[B5]], i32 0, i32 2 +// TCHECK3-NEXT: store float 1.000000e+00, float* [[ARRAYIDX]], align 4 +// TCHECK3-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[VLA6]], i32 3 +// TCHECK3-NEXT: store float 1.000000e+00, float* [[ARRAYIDX12]], align 4 +// TCHECK3-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[C7]], i32 0, i32 1 +// TCHECK3-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX13]], i32 0, i32 2 +// TCHECK3-NEXT: store double 1.000000e+00, double* [[ARRAYIDX14]], align 8 +// TCHECK3-NEXT: [[TMP24:%.*]] = mul nsw i32 1, [[TMP5]] +// TCHECK3-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds double, double* [[VLA8]], i32 [[TMP24]] +// TCHECK3-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX15]], i32 3 +// TCHECK3-NEXT: store double 1.000000e+00, double* [[ARRAYIDX16]], align 8 +// TCHECK3-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 0 +// TCHECK3-NEXT: store i64 1, i64* [[X]], align 4 +// TCHECK3-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D9]], i32 0, i32 1 +// TCHECK3-NEXT: store i8 1, i8* [[Y]], align 4 +// TCHECK3-NEXT: [[TMP25:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// TCHECK3-NEXT: call void @llvm.stackrestore(i8* [[TMP25]]) +// TCHECK3-NEXT: ret void +// +// +// TCHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooiPd_l111 +// TCHECK3-SAME: (double* noundef [[PTR:%.*]], %struct.TT.0* noundef nonnull align 4 dereferenceable(8) [[E:%.*]]) #[[ATTR0]] { +// TCHECK3-NEXT: entry: +// TCHECK3-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 4 +// TCHECK3-NEXT: [[E_ADDR:%.*]] = alloca %struct.TT.0*, align 4 +// TCHECK3-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 4 +// TCHECK3-NEXT: store %struct.TT.0* [[E]], %struct.TT.0** [[E_ADDR]], align 4 +// TCHECK3-NEXT: [[TMP0:%.*]] = load %struct.TT.0*, %struct.TT.0** [[E_ADDR]], align 4 +// TCHECK3-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0:%.*]], %struct.TT.0* [[TMP0]], i32 0, i32 0 +// TCHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[X]], align 4 +// TCHECK3-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP1]] to double +// TCHECK3-NEXT: [[TMP2:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// TCHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0 +// TCHECK3-NEXT: store double [[CONV]], double* [[ARRAYIDX]], align 4 +// TCHECK3-NEXT: [[TMP3:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// TCHECK3-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0 +// TCHECK3-NEXT: [[TMP4:%.*]] = load double, double* [[ARRAYIDX1]], align 4 +// TCHECK3-NEXT: [[INC:%.*]] = fadd double [[TMP4]], 1.000000e+00 +// TCHECK3-NEXT: store double [[INC]], double* [[ARRAYIDX1]], align 4 +// TCHECK3-NEXT: ret void +// +// +// TCHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l142 +// TCHECK3-SAME: (i32 noundef [[A:%.*]], i32 noundef [[AAA:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// TCHECK3-NEXT: entry: +// TCHECK3-NEXT: [[A_ADDR:%.*]] = alloca 
i32, align 4 +// TCHECK3-NEXT: [[AAA_ADDR:%.*]] = alloca i32, align 4 +// TCHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// TCHECK3-NEXT: [[B1:%.*]] = alloca [10 x i32], align 4 +// TCHECK3-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// TCHECK3-NEXT: store i32 [[AAA]], i32* [[AAA_ADDR]], align 4 +// TCHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// TCHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[AAA_ADDR]] to i8* +// TCHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// TCHECK3-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B1]] to i8* +// TCHECK3-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// TCHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i32 40, i1 false) +// TCHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// TCHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// TCHECK3-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// TCHECK3-NEXT: [[TMP4:%.*]] = load i8, i8* [[CONV]], align 1 +// TCHECK3-NEXT: [[CONV2:%.*]] = sext i8 [[TMP4]] to i32 +// TCHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[CONV2]], 1 +// TCHECK3-NEXT: [[CONV4:%.*]] = trunc i32 [[ADD3]] to i8 +// TCHECK3-NEXT: store i8 [[CONV4]], i8* [[CONV]], align 1 +// TCHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B1]], i32 0, i32 2 +// TCHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// TCHECK3-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP5]], 1 +// TCHECK3-NEXT: store i32 [[ADD5]], i32* [[ARRAYIDX]], align 4 +// TCHECK3-NEXT: ret void +// +// +// TCHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l167 +// TCHECK3-SAME: (%struct.S1* noundef [[THIS:%.*]], i32 noundef [[B:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR0]] { +// TCHECK3-NEXT: entry: +// TCHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 4 +// TCHECK3-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// TCHECK3-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4 +// TCHECK3-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4 +// TCHECK3-NEXT: [[C_ADDR:%.*]] = alloca i16*, align 4 +// TCHECK3-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// TCHECK3-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// TCHECK3-NEXT: [[__VLA_EXPR1:%.*]] = alloca i32, align 4 +// TCHECK3-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 4 +// TCHECK3-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 +// TCHECK3-NEXT: store i32 [[VLA]], i32* [[VLA_ADDR]], align 4 +// TCHECK3-NEXT: store i32 [[VLA1]], i32* [[VLA_ADDR2]], align 4 +// TCHECK3-NEXT: store i16* [[C]], i16** [[C_ADDR]], align 4 +// TCHECK3-NEXT: [[TMP0:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 4 +// TCHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4 +// TCHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4 +// TCHECK3-NEXT: [[TMP3:%.*]] = load i16*, i16** [[C_ADDR]], align 4 +// TCHECK3-NEXT: [[TMP4:%.*]] = call i8* @llvm.stacksave() +// TCHECK3-NEXT: store i8* [[TMP4]], i8** [[SAVED_STACK]], align 4 +// TCHECK3-NEXT: [[TMP5:%.*]] = mul nuw i32 [[TMP1]], [[TMP2]] +// TCHECK3-NEXT: [[VLA3:%.*]] = alloca i16, i32 [[TMP5]], align 2 +// TCHECK3-NEXT: store i32 [[TMP1]], i32* [[__VLA_EXPR0]], align 4 +// TCHECK3-NEXT: store i32 [[TMP2]], i32* [[__VLA_EXPR1]], align 4 +// TCHECK3-NEXT: [[TMP6:%.*]] = mul nuw i32 [[TMP1]], [[TMP2]] +// TCHECK3-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 2 +// 
TCHECK3-NEXT: [[TMP8:%.*]] = bitcast i16* [[VLA3]] to i8* +// TCHECK3-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP3]] to i8* +// TCHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[TMP8]], i8* align 2 [[TMP9]], i32 [[TMP7]], i1 false) +// TCHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[B_ADDR]], align 4 +// TCHECK3-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP10]] to double +// TCHECK3-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00 +// TCHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[TMP0]], i32 0, i32 0 +// TCHECK3-NEXT: store double [[ADD]], double* [[A]], align 4 +// TCHECK3-NEXT: [[A4:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0 +// TCHECK3-NEXT: [[TMP11:%.*]] = load double, double* [[A4]], align 4 +// TCHECK3-NEXT: [[INC:%.*]] = fadd double [[TMP11]], 1.000000e+00 +// TCHECK3-NEXT: store double [[INC]], double* [[A4]], align 4 +// TCHECK3-NEXT: [[CONV5:%.*]] = fptosi double [[INC]] to i16 +// TCHECK3-NEXT: [[TMP12:%.*]] = mul nsw i32 1, [[TMP2]] +// TCHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA3]], i32 [[TMP12]] +// TCHECK3-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i32 1 +// TCHECK3-NEXT: store i16 [[CONV5]], i16* [[ARRAYIDX6]], align 2 +// TCHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// TCHECK3-NEXT: call void @llvm.stackrestore(i8* [[TMP13]]) +// TCHECK3-NEXT: ret void +// +// +// TCHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l128 +// TCHECK3-SAME: (i32 noundef [[A:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// TCHECK3-NEXT: entry: +// TCHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// TCHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// TCHECK3-NEXT: [[B1:%.*]] = alloca [10 x i32], align 4 +// TCHECK3-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// TCHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// TCHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// TCHECK3-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[B1]] to i8* +// TCHECK3-NEXT: [[TMP2:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// TCHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP1]], i8* align 4 [[TMP2]], i32 40, i1 false) +// TCHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// TCHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 +// TCHECK3-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// TCHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B1]], i32 0, i32 2 +// TCHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// TCHECK3-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], 1 +// TCHECK3-NEXT: store i32 [[ADD2]], i32* [[ARRAYIDX]], align 4 +// TCHECK3-NEXT: ret void +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_Z3fooiPd +// SIMD-ONLY1-SAME: (i32 noundef signext [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 8 +// SIMD-ONLY1-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: [[AA:%.*]] = alloca i16, align 2 +// SIMD-ONLY1-NEXT: [[B:%.*]] = alloca [10 x float], align 4 +// SIMD-ONLY1-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// SIMD-ONLY1-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// SIMD-ONLY1-NEXT: [[C:%.*]] = alloca [5 x [10 x double]], align 8 +// 
SIMD-ONLY1-NEXT: [[__VLA_EXPR1:%.*]] = alloca i64, align 8 +// SIMD-ONLY1-NEXT: [[D:%.*]] = alloca [[STRUCT_TT:%.*]], align 8 +// SIMD-ONLY1-NEXT: [[E:%.*]] = alloca [[STRUCT_TT_0:%.*]], align 4 +// SIMD-ONLY1-NEXT: [[P:%.*]] = alloca i32*, align 64 +// SIMD-ONLY1-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY1-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 8 +// SIMD-ONLY1-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY1-NEXT: store i16 0, i16* [[AA]], align 2 +// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY1-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +// SIMD-ONLY1-NEXT: [[TMP2:%.*]] = call i8* @llvm.stacksave() +// SIMD-ONLY1-NEXT: store i8* [[TMP2]], i8** [[SAVED_STACK]], align 8 +// SIMD-ONLY1-NEXT: [[VLA:%.*]] = alloca float, i64 [[TMP1]], align 4 +// SIMD-ONLY1-NEXT: store i64 [[TMP1]], i64* [[__VLA_EXPR0]], align 8 +// SIMD-ONLY1-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY1-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +// SIMD-ONLY1-NEXT: [[TMP5:%.*]] = mul nuw i64 5, [[TMP4]] +// SIMD-ONLY1-NEXT: [[VLA1:%.*]] = alloca double, i64 [[TMP5]], align 8 +// SIMD-ONLY1-NEXT: store i64 [[TMP4]], i64* [[__VLA_EXPR1]], align 8 +// SIMD-ONLY1-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 0 +// SIMD-ONLY1-NEXT: [[TMP6:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY1-NEXT: store i32 [[TMP6]], i32* [[X]], align 4 +// SIMD-ONLY1-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 1 +// SIMD-ONLY1-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY1-NEXT: store i32 [[TMP7]], i32* [[Y]], align 4 +// SIMD-ONLY1-NEXT: store i32* [[A]], i32** [[P]], align 64 +// SIMD-ONLY1-NEXT: [[TMP8:%.*]] = load i16, i16* [[AA]], align 2 +// SIMD-ONLY1-NEXT: [[CONV:%.*]] = sext i16 [[TMP8]] to i32 +// SIMD-ONLY1-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1 +// SIMD-ONLY1-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD]] to i16 +// SIMD-ONLY1-NEXT: store i16 [[CONV2]], i16* [[AA]], align 2 +// SIMD-ONLY1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[B]], i64 0, i64 2 +// SIMD-ONLY1-NEXT: store float 1.000000e+00, float* [[ARRAYIDX]], align 4 +// SIMD-ONLY1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[VLA]], i64 3 +// SIMD-ONLY1-NEXT: store float 1.000000e+00, float* [[ARRAYIDX3]], align 4 +// SIMD-ONLY1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[C]], i64 0, i64 1 +// SIMD-ONLY1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX4]], i64 0, i64 2 +// SIMD-ONLY1-NEXT: store double 1.000000e+00, double* [[ARRAYIDX5]], align 8 +// SIMD-ONLY1-NEXT: [[TMP9:%.*]] = mul nsw i64 1, [[TMP4]] +// SIMD-ONLY1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[VLA1]], i64 [[TMP9]] +// SIMD-ONLY1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX6]], i64 3 +// SIMD-ONLY1-NEXT: store double 1.000000e+00, double* [[ARRAYIDX7]], align 8 +// SIMD-ONLY1-NEXT: [[X8:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D]], i32 0, i32 0 +// SIMD-ONLY1-NEXT: store i64 1, i64* [[X8]], align 8 +// SIMD-ONLY1-NEXT: [[Y9:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D]], i32 0, i32 1 +// SIMD-ONLY1-NEXT: store i8 1, i8* [[Y9]], align 8 +// SIMD-ONLY1-NEXT: [[X10:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 0 +// 
SIMD-ONLY1-NEXT: [[TMP10:%.*]] = load i32, i32* [[X10]], align 4 +// SIMD-ONLY1-NEXT: [[CONV11:%.*]] = sitofp i32 [[TMP10]] to double +// SIMD-ONLY1-NEXT: [[TMP11:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[TMP11]], i64 0 +// SIMD-ONLY1-NEXT: store double [[CONV11]], double* [[ARRAYIDX12]], align 8 +// SIMD-ONLY1-NEXT: [[TMP12:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[TMP12]], i64 0 +// SIMD-ONLY1-NEXT: [[TMP13:%.*]] = load double, double* [[ARRAYIDX13]], align 8 +// SIMD-ONLY1-NEXT: [[INC:%.*]] = fadd double [[TMP13]], 1.000000e+00 +// SIMD-ONLY1-NEXT: store double [[INC]], double* [[ARRAYIDX13]], align 8 +// SIMD-ONLY1-NEXT: [[TMP14:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY1-NEXT: [[TMP15:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// SIMD-ONLY1-NEXT: call void @llvm.stackrestore(i8* [[TMP15]]) +// SIMD-ONLY1-NEXT: ret i32 [[TMP14]] +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_Z3bariPd +// SIMD-ONLY1-SAME: (i32 noundef signext [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0]] { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 8 +// SIMD-ONLY1-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: [[S:%.*]] = alloca [[STRUCT_S1:%.*]], align 8 +// SIMD-ONLY1-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY1-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 8 +// SIMD-ONLY1-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY1-NEXT: [[TMP1:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[CALL:%.*]] = call noundef signext i32 @_Z3fooiPd(i32 noundef signext [[TMP0]], double* noundef [[TMP1]]) +// SIMD-ONLY1-NEXT: [[TMP2:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[CALL]] +// SIMD-ONLY1-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// SIMD-ONLY1-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY1-NEXT: [[CALL1:%.*]] = call noundef signext i32 @_ZN2S12r1Ei(%struct.S1* noundef nonnull align 8 dereferenceable(8) [[S]], i32 noundef signext [[TMP3]]) +// SIMD-ONLY1-NEXT: [[TMP4:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY1-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], [[CALL1]] +// SIMD-ONLY1-NEXT: store i32 [[ADD2]], i32* [[A]], align 4 +// SIMD-ONLY1-NEXT: [[TMP5:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY1-NEXT: [[CALL3:%.*]] = call noundef signext i32 @_ZL7fstatici(i32 noundef signext [[TMP5]]) +// SIMD-ONLY1-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP6]], [[CALL3]] +// SIMD-ONLY1-NEXT: store i32 [[ADD4]], i32* [[A]], align 4 +// SIMD-ONLY1-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY1-NEXT: [[CALL5:%.*]] = call noundef signext i32 @_Z9ftemplateIiET_i(i32 noundef signext [[TMP7]]) +// SIMD-ONLY1-NEXT: [[TMP8:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP8]], [[CALL5]] +// SIMD-ONLY1-NEXT: store i32 [[ADD6]], i32* [[A]], align 4 +// SIMD-ONLY1-NEXT: [[TMP9:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY1-NEXT: ret i32 [[TMP9]] +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZN2S12r1Ei +// SIMD-ONLY1-SAME: (%struct.S1* noundef nonnull align 8 
dereferenceable(8) [[THIS:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] comdat align 2 { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 8 +// SIMD-ONLY1-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: [[B:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// SIMD-ONLY1-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// SIMD-ONLY1-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY1-NEXT: [[THIS1:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 8 +// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY1-NEXT: store i32 [[ADD]], i32* [[B]], align 4 +// SIMD-ONLY1-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY1-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// SIMD-ONLY1-NEXT: [[TMP3:%.*]] = call i8* @llvm.stacksave() +// SIMD-ONLY1-NEXT: store i8* [[TMP3]], i8** [[SAVED_STACK]], align 8 +// SIMD-ONLY1-NEXT: [[TMP4:%.*]] = mul nuw i64 2, [[TMP2]] +// SIMD-ONLY1-NEXT: [[VLA:%.*]] = alloca i16, i64 [[TMP4]], align 2 +// SIMD-ONLY1-NEXT: store i64 [[TMP2]], i64* [[__VLA_EXPR0]], align 8 +// SIMD-ONLY1-NEXT: [[TMP5:%.*]] = load i32, i32* [[B]], align 4 +// SIMD-ONLY1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP5]] to double +// SIMD-ONLY1-NEXT: [[ADD2:%.*]] = fadd double [[CONV]], 1.500000e+00 +// SIMD-ONLY1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY1-NEXT: store double [[ADD2]], double* [[A]], align 8 +// SIMD-ONLY1-NEXT: [[A3:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY1-NEXT: [[TMP6:%.*]] = load double, double* [[A3]], align 8 +// SIMD-ONLY1-NEXT: [[INC:%.*]] = fadd double [[TMP6]], 1.000000e+00 +// SIMD-ONLY1-NEXT: store double [[INC]], double* [[A3]], align 8 +// SIMD-ONLY1-NEXT: [[CONV4:%.*]] = fptosi double [[INC]] to i16 +// SIMD-ONLY1-NEXT: [[TMP7:%.*]] = mul nsw i64 1, [[TMP2]] +// SIMD-ONLY1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA]], i64 [[TMP7]] +// SIMD-ONLY1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i64 1 +// SIMD-ONLY1-NEXT: store i16 [[CONV4]], i16* [[ARRAYIDX5]], align 2 +// SIMD-ONLY1-NEXT: [[TMP8:%.*]] = mul nsw i64 1, [[TMP2]] +// SIMD-ONLY1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, i16* [[VLA]], i64 [[TMP8]] +// SIMD-ONLY1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX6]], i64 1 +// SIMD-ONLY1-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX7]], align 2 +// SIMD-ONLY1-NEXT: [[CONV8:%.*]] = sext i16 [[TMP9]] to i32 +// SIMD-ONLY1-NEXT: [[TMP10:%.*]] = load i32, i32* [[B]], align 4 +// SIMD-ONLY1-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], [[TMP10]] +// SIMD-ONLY1-NEXT: [[TMP11:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// SIMD-ONLY1-NEXT: call void @llvm.stackrestore(i8* [[TMP11]]) +// SIMD-ONLY1-NEXT: ret i32 [[ADD9]] +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_ZL7fstatici +// SIMD-ONLY1-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0]] { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: [[AAA:%.*]] = alloca i8, align 1 +// SIMD-ONLY1-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY1-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// 
SIMD-ONLY1-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY1-NEXT: store i8 0, i8* [[AAA]], align 1 +// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY1-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// SIMD-ONLY1-NEXT: [[TMP1:%.*]] = load i8, i8* [[AAA]], align 1 +// SIMD-ONLY1-NEXT: [[CONV:%.*]] = sext i8 [[TMP1]] to i32 +// SIMD-ONLY1-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1 +// SIMD-ONLY1-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i8 +// SIMD-ONLY1-NEXT: store i8 [[CONV2]], i8* [[AAA]], align 1 +// SIMD-ONLY1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B]], i64 0, i64 2 +// SIMD-ONLY1-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP2]], 1 +// SIMD-ONLY1-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY1-NEXT: [[TMP3:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY1-NEXT: ret i32 [[TMP3]] +// +// +// SIMD-ONLY1-LABEL: define {{[^@]+}}@_Z9ftemplateIiET_i +// SIMD-ONLY1-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0]] comdat { +// SIMD-ONLY1-NEXT: entry: +// SIMD-ONLY1-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY1-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY1-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY1-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY1-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// SIMD-ONLY1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B]], i64 0, i64 2 +// SIMD-ONLY1-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY1-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP1]], 1 +// SIMD-ONLY1-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY1-NEXT: [[TMP2:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY1-NEXT: ret i32 [[TMP2]] +// +// +// SIMD-ONLY11-LABEL: define {{[^@]+}}@_Z3fooiPd +// SIMD-ONLY11-SAME: (i32 noundef signext [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +// SIMD-ONLY11-NEXT: entry: +// SIMD-ONLY11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY11-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 8 +// SIMD-ONLY11-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY11-NEXT: [[AA:%.*]] = alloca i16, align 2 +// SIMD-ONLY11-NEXT: [[B:%.*]] = alloca [10 x float], align 4 +// SIMD-ONLY11-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// SIMD-ONLY11-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// SIMD-ONLY11-NEXT: [[C:%.*]] = alloca [5 x [10 x double]], align 8 +// SIMD-ONLY11-NEXT: [[__VLA_EXPR1:%.*]] = alloca i64, align 8 +// SIMD-ONLY11-NEXT: [[D:%.*]] = alloca [[STRUCT_TT:%.*]], align 8 +// SIMD-ONLY11-NEXT: [[E:%.*]] = alloca [[STRUCT_TT_0:%.*]], align 4 +// SIMD-ONLY11-NEXT: [[P:%.*]] = alloca i32*, align 64 +// SIMD-ONLY11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY11-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 8 +// SIMD-ONLY11-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY11-NEXT: store i16 0, i16* [[AA]], align 2 +// SIMD-ONLY11-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY11-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +// SIMD-ONLY11-NEXT: [[TMP2:%.*]] = call i8* @llvm.stacksave() +// SIMD-ONLY11-NEXT: store i8* [[TMP2]], i8** [[SAVED_STACK]], align 8 +// SIMD-ONLY11-NEXT: 
[[VLA:%.*]] = alloca float, i64 [[TMP1]], align 4 +// SIMD-ONLY11-NEXT: store i64 [[TMP1]], i64* [[__VLA_EXPR0]], align 8 +// SIMD-ONLY11-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY11-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +// SIMD-ONLY11-NEXT: [[TMP5:%.*]] = mul nuw i64 5, [[TMP4]] +// SIMD-ONLY11-NEXT: [[VLA1:%.*]] = alloca double, i64 [[TMP5]], align 8 +// SIMD-ONLY11-NEXT: store i64 [[TMP4]], i64* [[__VLA_EXPR1]], align 8 +// SIMD-ONLY11-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 0 +// SIMD-ONLY11-NEXT: [[TMP6:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY11-NEXT: store i32 [[TMP6]], i32* [[X]], align 4 +// SIMD-ONLY11-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 1 +// SIMD-ONLY11-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY11-NEXT: store i32 [[TMP7]], i32* [[Y]], align 4 +// SIMD-ONLY11-NEXT: store i32* [[A]], i32** [[P]], align 64 +// SIMD-ONLY11-NEXT: [[TMP8:%.*]] = load i16, i16* [[AA]], align 2 +// SIMD-ONLY11-NEXT: [[CONV:%.*]] = sext i16 [[TMP8]] to i32 +// SIMD-ONLY11-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1 +// SIMD-ONLY11-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD]] to i16 +// SIMD-ONLY11-NEXT: store i16 [[CONV2]], i16* [[AA]], align 2 +// SIMD-ONLY11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[B]], i64 0, i64 2 +// SIMD-ONLY11-NEXT: store float 1.000000e+00, float* [[ARRAYIDX]], align 4 +// SIMD-ONLY11-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[VLA]], i64 3 +// SIMD-ONLY11-NEXT: store float 1.000000e+00, float* [[ARRAYIDX3]], align 4 +// SIMD-ONLY11-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[C]], i64 0, i64 1 +// SIMD-ONLY11-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX4]], i64 0, i64 2 +// SIMD-ONLY11-NEXT: store double 1.000000e+00, double* [[ARRAYIDX5]], align 8 +// SIMD-ONLY11-NEXT: [[TMP9:%.*]] = mul nsw i64 1, [[TMP4]] +// SIMD-ONLY11-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[VLA1]], i64 [[TMP9]] +// SIMD-ONLY11-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX6]], i64 3 +// SIMD-ONLY11-NEXT: store double 1.000000e+00, double* [[ARRAYIDX7]], align 8 +// SIMD-ONLY11-NEXT: [[X8:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D]], i32 0, i32 0 +// SIMD-ONLY11-NEXT: store i64 1, i64* [[X8]], align 8 +// SIMD-ONLY11-NEXT: [[Y9:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D]], i32 0, i32 1 +// SIMD-ONLY11-NEXT: store i8 1, i8* [[Y9]], align 8 +// SIMD-ONLY11-NEXT: [[X10:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 0 +// SIMD-ONLY11-NEXT: [[TMP10:%.*]] = load i32, i32* [[X10]], align 4 +// SIMD-ONLY11-NEXT: [[CONV11:%.*]] = sitofp i32 [[TMP10]] to double +// SIMD-ONLY11-NEXT: [[TMP11:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// SIMD-ONLY11-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[TMP11]], i64 0 +// SIMD-ONLY11-NEXT: store double [[CONV11]], double* [[ARRAYIDX12]], align 8 +// SIMD-ONLY11-NEXT: [[TMP12:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// SIMD-ONLY11-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[TMP12]], i64 0 +// SIMD-ONLY11-NEXT: [[TMP13:%.*]] = load double, double* [[ARRAYIDX13]], align 8 +// SIMD-ONLY11-NEXT: [[INC:%.*]] = fadd double [[TMP13]], 1.000000e+00 +// SIMD-ONLY11-NEXT: 
store double [[INC]], double* [[ARRAYIDX13]], align 8 +// SIMD-ONLY11-NEXT: [[TMP14:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY11-NEXT: [[TMP15:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// SIMD-ONLY11-NEXT: call void @llvm.stackrestore(i8* [[TMP15]]) +// SIMD-ONLY11-NEXT: ret i32 [[TMP14]] +// +// +// SIMD-ONLY11-LABEL: define {{[^@]+}}@_Z3bariPd +// SIMD-ONLY11-SAME: (i32 noundef signext [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0]] { +// SIMD-ONLY11-NEXT: entry: +// SIMD-ONLY11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY11-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 8 +// SIMD-ONLY11-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY11-NEXT: [[S:%.*]] = alloca [[STRUCT_S1:%.*]], align 8 +// SIMD-ONLY11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY11-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 8 +// SIMD-ONLY11-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY11-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY11-NEXT: [[TMP1:%.*]] = load double*, double** [[PTR_ADDR]], align 8 +// SIMD-ONLY11-NEXT: [[CALL:%.*]] = call noundef signext i32 @_Z3fooiPd(i32 noundef signext [[TMP0]], double* noundef [[TMP1]]) +// SIMD-ONLY11-NEXT: [[TMP2:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[CALL]] +// SIMD-ONLY11-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// SIMD-ONLY11-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY11-NEXT: [[CALL1:%.*]] = call noundef signext i32 @_ZN2S12r1Ei(%struct.S1* noundef nonnull align 8 dereferenceable(8) [[S]], i32 noundef signext [[TMP3]]) +// SIMD-ONLY11-NEXT: [[TMP4:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY11-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], [[CALL1]] +// SIMD-ONLY11-NEXT: store i32 [[ADD2]], i32* [[A]], align 4 +// SIMD-ONLY11-NEXT: [[TMP5:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY11-NEXT: [[CALL3:%.*]] = call noundef signext i32 @_ZL7fstatici(i32 noundef signext [[TMP5]]) +// SIMD-ONLY11-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY11-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP6]], [[CALL3]] +// SIMD-ONLY11-NEXT: store i32 [[ADD4]], i32* [[A]], align 4 +// SIMD-ONLY11-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY11-NEXT: [[CALL5:%.*]] = call noundef signext i32 @_Z9ftemplateIiET_i(i32 noundef signext [[TMP7]]) +// SIMD-ONLY11-NEXT: [[TMP8:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY11-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP8]], [[CALL5]] +// SIMD-ONLY11-NEXT: store i32 [[ADD6]], i32* [[A]], align 4 +// SIMD-ONLY11-NEXT: [[TMP9:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY11-NEXT: ret i32 [[TMP9]] +// +// +// SIMD-ONLY11-LABEL: define {{[^@]+}}@_ZN2S12r1Ei +// SIMD-ONLY11-SAME: (%struct.S1* noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] comdat align 2 { +// SIMD-ONLY11-NEXT: entry: +// SIMD-ONLY11-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 8 +// SIMD-ONLY11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY11-NEXT: [[B:%.*]] = alloca i32, align 4 +// SIMD-ONLY11-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8 +// SIMD-ONLY11-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8 +// SIMD-ONLY11-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 8 +// SIMD-ONLY11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY11-NEXT: [[THIS1:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 8 +// SIMD-ONLY11-NEXT: [[TMP0:%.*]] = load i32, 
i32* [[N_ADDR]], align 4 +// SIMD-ONLY11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY11-NEXT: store i32 [[ADD]], i32* [[B]], align 4 +// SIMD-ONLY11-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY11-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// SIMD-ONLY11-NEXT: [[TMP3:%.*]] = call i8* @llvm.stacksave() +// SIMD-ONLY11-NEXT: store i8* [[TMP3]], i8** [[SAVED_STACK]], align 8 +// SIMD-ONLY11-NEXT: [[TMP4:%.*]] = mul nuw i64 2, [[TMP2]] +// SIMD-ONLY11-NEXT: [[VLA:%.*]] = alloca i16, i64 [[TMP4]], align 2 +// SIMD-ONLY11-NEXT: store i64 [[TMP2]], i64* [[__VLA_EXPR0]], align 8 +// SIMD-ONLY11-NEXT: [[TMP5:%.*]] = load i32, i32* [[B]], align 4 +// SIMD-ONLY11-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP5]] to double +// SIMD-ONLY11-NEXT: [[ADD2:%.*]] = fadd double [[CONV]], 1.500000e+00 +// SIMD-ONLY11-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY11-NEXT: store double [[ADD2]], double* [[A]], align 8 +// SIMD-ONLY11-NEXT: [[A3:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY11-NEXT: [[TMP6:%.*]] = load double, double* [[A3]], align 8 +// SIMD-ONLY11-NEXT: [[INC:%.*]] = fadd double [[TMP6]], 1.000000e+00 +// SIMD-ONLY11-NEXT: store double [[INC]], double* [[A3]], align 8 +// SIMD-ONLY11-NEXT: [[CONV4:%.*]] = fptosi double [[INC]] to i16 +// SIMD-ONLY11-NEXT: [[TMP7:%.*]] = mul nsw i64 1, [[TMP2]] +// SIMD-ONLY11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA]], i64 [[TMP7]] +// SIMD-ONLY11-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i64 1 +// SIMD-ONLY11-NEXT: store i16 [[CONV4]], i16* [[ARRAYIDX5]], align 2 +// SIMD-ONLY11-NEXT: [[TMP8:%.*]] = mul nsw i64 1, [[TMP2]] +// SIMD-ONLY11-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, i16* [[VLA]], i64 [[TMP8]] +// SIMD-ONLY11-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX6]], i64 1 +// SIMD-ONLY11-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX7]], align 2 +// SIMD-ONLY11-NEXT: [[CONV8:%.*]] = sext i16 [[TMP9]] to i32 +// SIMD-ONLY11-NEXT: [[TMP10:%.*]] = load i32, i32* [[B]], align 4 +// SIMD-ONLY11-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], [[TMP10]] +// SIMD-ONLY11-NEXT: [[TMP11:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8 +// SIMD-ONLY11-NEXT: call void @llvm.stackrestore(i8* [[TMP11]]) +// SIMD-ONLY11-NEXT: ret i32 [[ADD9]] +// +// +// SIMD-ONLY11-LABEL: define {{[^@]+}}@_ZL7fstatici +// SIMD-ONLY11-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0]] { +// SIMD-ONLY11-NEXT: entry: +// SIMD-ONLY11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY11-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY11-NEXT: [[AAA:%.*]] = alloca i8, align 1 +// SIMD-ONLY11-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY11-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY11-NEXT: store i8 0, i8* [[AAA]], align 1 +// SIMD-ONLY11-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY11-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// SIMD-ONLY11-NEXT: [[TMP1:%.*]] = load i8, i8* [[AAA]], align 1 +// SIMD-ONLY11-NEXT: [[CONV:%.*]] = sext i8 [[TMP1]] to i32 +// SIMD-ONLY11-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1 +// SIMD-ONLY11-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i8 +// SIMD-ONLY11-NEXT: store i8 [[CONV2]], i8* [[AAA]], align 1 +// SIMD-ONLY11-NEXT: [[ARRAYIDX:%.*]] = getelementptr 
inbounds [10 x i32], [10 x i32]* [[B]], i64 0, i64 2 +// SIMD-ONLY11-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY11-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP2]], 1 +// SIMD-ONLY11-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY11-NEXT: [[TMP3:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY11-NEXT: ret i32 [[TMP3]] +// +// +// SIMD-ONLY11-LABEL: define {{[^@]+}}@_Z9ftemplateIiET_i +// SIMD-ONLY11-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0]] comdat { +// SIMD-ONLY11-NEXT: entry: +// SIMD-ONLY11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY11-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY11-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY11-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY11-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY11-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// SIMD-ONLY11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B]], i64 0, i64 2 +// SIMD-ONLY11-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY11-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP1]], 1 +// SIMD-ONLY11-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY11-NEXT: [[TMP2:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY11-NEXT: ret i32 [[TMP2]] +// +// +// SIMD-ONLY12-LABEL: define {{[^@]+}}@_Z3fooiPd +// SIMD-ONLY12-SAME: (i32 noundef [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +// SIMD-ONLY12-NEXT: entry: +// SIMD-ONLY12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY12-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 4 +// SIMD-ONLY12-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY12-NEXT: [[AA:%.*]] = alloca i16, align 2 +// SIMD-ONLY12-NEXT: [[B:%.*]] = alloca [10 x float], align 4 +// SIMD-ONLY12-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// SIMD-ONLY12-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// SIMD-ONLY12-NEXT: [[C:%.*]] = alloca [5 x [10 x double]], align 8 +// SIMD-ONLY12-NEXT: [[__VLA_EXPR1:%.*]] = alloca i32, align 4 +// SIMD-ONLY12-NEXT: [[D:%.*]] = alloca [[STRUCT_TT:%.*]], align 4 +// SIMD-ONLY12-NEXT: [[E:%.*]] = alloca [[STRUCT_TT_0:%.*]], align 4 +// SIMD-ONLY12-NEXT: [[P:%.*]] = alloca i32*, align 64 +// SIMD-ONLY12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY12-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 4 +// SIMD-ONLY12-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY12-NEXT: store i16 0, i16* [[AA]], align 2 +// SIMD-ONLY12-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY12-NEXT: [[TMP1:%.*]] = call i8* @llvm.stacksave() +// SIMD-ONLY12-NEXT: store i8* [[TMP1]], i8** [[SAVED_STACK]], align 4 +// SIMD-ONLY12-NEXT: [[VLA:%.*]] = alloca float, i32 [[TMP0]], align 4 +// SIMD-ONLY12-NEXT: store i32 [[TMP0]], i32* [[__VLA_EXPR0]], align 4 +// SIMD-ONLY12-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY12-NEXT: [[TMP3:%.*]] = mul nuw i32 5, [[TMP2]] +// SIMD-ONLY12-NEXT: [[VLA1:%.*]] = alloca double, i32 [[TMP3]], align 8 +// SIMD-ONLY12-NEXT: store i32 [[TMP2]], i32* [[__VLA_EXPR1]], align 4 +// SIMD-ONLY12-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 0 +// SIMD-ONLY12-NEXT: [[TMP4:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY12-NEXT: store i32 [[TMP4]], i32* [[X]], align 4 +// SIMD-ONLY12-NEXT: [[Y:%.*]] = getelementptr inbounds 
[[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 1 +// SIMD-ONLY12-NEXT: [[TMP5:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY12-NEXT: store i32 [[TMP5]], i32* [[Y]], align 4 +// SIMD-ONLY12-NEXT: store i32* [[A]], i32** [[P]], align 64 +// SIMD-ONLY12-NEXT: [[TMP6:%.*]] = load i16, i16* [[AA]], align 2 +// SIMD-ONLY12-NEXT: [[CONV:%.*]] = sext i16 [[TMP6]] to i32 +// SIMD-ONLY12-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1 +// SIMD-ONLY12-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD]] to i16 +// SIMD-ONLY12-NEXT: store i16 [[CONV2]], i16* [[AA]], align 2 +// SIMD-ONLY12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[B]], i32 0, i32 2 +// SIMD-ONLY12-NEXT: store float 1.000000e+00, float* [[ARRAYIDX]], align 4 +// SIMD-ONLY12-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[VLA]], i32 3 +// SIMD-ONLY12-NEXT: store float 1.000000e+00, float* [[ARRAYIDX3]], align 4 +// SIMD-ONLY12-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[C]], i32 0, i32 1 +// SIMD-ONLY12-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX4]], i32 0, i32 2 +// SIMD-ONLY12-NEXT: store double 1.000000e+00, double* [[ARRAYIDX5]], align 8 +// SIMD-ONLY12-NEXT: [[TMP7:%.*]] = mul nsw i32 1, [[TMP2]] +// SIMD-ONLY12-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[VLA1]], i32 [[TMP7]] +// SIMD-ONLY12-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX6]], i32 3 +// SIMD-ONLY12-NEXT: store double 1.000000e+00, double* [[ARRAYIDX7]], align 8 +// SIMD-ONLY12-NEXT: [[X8:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D]], i32 0, i32 0 +// SIMD-ONLY12-NEXT: store i64 1, i64* [[X8]], align 4 +// SIMD-ONLY12-NEXT: [[Y9:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D]], i32 0, i32 1 +// SIMD-ONLY12-NEXT: store i8 1, i8* [[Y9]], align 4 +// SIMD-ONLY12-NEXT: [[X10:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 0 +// SIMD-ONLY12-NEXT: [[TMP8:%.*]] = load i32, i32* [[X10]], align 4 +// SIMD-ONLY12-NEXT: [[CONV11:%.*]] = sitofp i32 [[TMP8]] to double +// SIMD-ONLY12-NEXT: [[TMP9:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// SIMD-ONLY12-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[TMP9]], i32 0 +// SIMD-ONLY12-NEXT: store double [[CONV11]], double* [[ARRAYIDX12]], align 4 +// SIMD-ONLY12-NEXT: [[TMP10:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// SIMD-ONLY12-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0 +// SIMD-ONLY12-NEXT: [[TMP11:%.*]] = load double, double* [[ARRAYIDX13]], align 4 +// SIMD-ONLY12-NEXT: [[INC:%.*]] = fadd double [[TMP11]], 1.000000e+00 +// SIMD-ONLY12-NEXT: store double [[INC]], double* [[ARRAYIDX13]], align 4 +// SIMD-ONLY12-NEXT: [[TMP12:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY12-NEXT: [[TMP13:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// SIMD-ONLY12-NEXT: call void @llvm.stackrestore(i8* [[TMP13]]) +// SIMD-ONLY12-NEXT: ret i32 [[TMP12]] +// +// +// SIMD-ONLY12-LABEL: define {{[^@]+}}@_Z3bariPd +// SIMD-ONLY12-SAME: (i32 noundef [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0]] { +// SIMD-ONLY12-NEXT: entry: +// SIMD-ONLY12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY12-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 4 +// SIMD-ONLY12-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY12-NEXT: [[S:%.*]] = alloca [[STRUCT_S1:%.*]], align 4 +// SIMD-ONLY12-NEXT: store i32 [[N]], 
i32* [[N_ADDR]], align 4 +// SIMD-ONLY12-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 4 +// SIMD-ONLY12-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY12-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY12-NEXT: [[TMP1:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// SIMD-ONLY12-NEXT: [[CALL:%.*]] = call noundef i32 @_Z3fooiPd(i32 noundef [[TMP0]], double* noundef [[TMP1]]) +// SIMD-ONLY12-NEXT: [[TMP2:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[CALL]] +// SIMD-ONLY12-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// SIMD-ONLY12-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY12-NEXT: [[CALL1:%.*]] = call noundef i32 @_ZN2S12r1Ei(%struct.S1* noundef nonnull align 4 dereferenceable(8) [[S]], i32 noundef [[TMP3]]) +// SIMD-ONLY12-NEXT: [[TMP4:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY12-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], [[CALL1]] +// SIMD-ONLY12-NEXT: store i32 [[ADD2]], i32* [[A]], align 4 +// SIMD-ONLY12-NEXT: [[TMP5:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY12-NEXT: [[CALL3:%.*]] = call noundef i32 @_ZL7fstatici(i32 noundef [[TMP5]]) +// SIMD-ONLY12-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY12-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP6]], [[CALL3]] +// SIMD-ONLY12-NEXT: store i32 [[ADD4]], i32* [[A]], align 4 +// SIMD-ONLY12-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY12-NEXT: [[CALL5:%.*]] = call noundef i32 @_Z9ftemplateIiET_i(i32 noundef [[TMP7]]) +// SIMD-ONLY12-NEXT: [[TMP8:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY12-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP8]], [[CALL5]] +// SIMD-ONLY12-NEXT: store i32 [[ADD6]], i32* [[A]], align 4 +// SIMD-ONLY12-NEXT: [[TMP9:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY12-NEXT: ret i32 [[TMP9]] +// +// +// SIMD-ONLY12-LABEL: define {{[^@]+}}@_ZN2S12r1Ei +// SIMD-ONLY12-SAME: (%struct.S1* noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0]] comdat align 2 { +// SIMD-ONLY12-NEXT: entry: +// SIMD-ONLY12-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 4 +// SIMD-ONLY12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY12-NEXT: [[B:%.*]] = alloca i32, align 4 +// SIMD-ONLY12-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// SIMD-ONLY12-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// SIMD-ONLY12-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 4 +// SIMD-ONLY12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY12-NEXT: [[THIS1:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 4 +// SIMD-ONLY12-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY12-NEXT: store i32 [[ADD]], i32* [[B]], align 4 +// SIMD-ONLY12-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY12-NEXT: [[TMP2:%.*]] = call i8* @llvm.stacksave() +// SIMD-ONLY12-NEXT: store i8* [[TMP2]], i8** [[SAVED_STACK]], align 4 +// SIMD-ONLY12-NEXT: [[TMP3:%.*]] = mul nuw i32 2, [[TMP1]] +// SIMD-ONLY12-NEXT: [[VLA:%.*]] = alloca i16, i32 [[TMP3]], align 2 +// SIMD-ONLY12-NEXT: store i32 [[TMP1]], i32* [[__VLA_EXPR0]], align 4 +// SIMD-ONLY12-NEXT: [[TMP4:%.*]] = load i32, i32* [[B]], align 4 +// SIMD-ONLY12-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP4]] to double +// SIMD-ONLY12-NEXT: [[ADD2:%.*]] = fadd double [[CONV]], 1.500000e+00 +// SIMD-ONLY12-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], 
%struct.S1* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY12-NEXT: store double [[ADD2]], double* [[A]], align 4 +// SIMD-ONLY12-NEXT: [[A3:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY12-NEXT: [[TMP5:%.*]] = load double, double* [[A3]], align 4 +// SIMD-ONLY12-NEXT: [[INC:%.*]] = fadd double [[TMP5]], 1.000000e+00 +// SIMD-ONLY12-NEXT: store double [[INC]], double* [[A3]], align 4 +// SIMD-ONLY12-NEXT: [[CONV4:%.*]] = fptosi double [[INC]] to i16 +// SIMD-ONLY12-NEXT: [[TMP6:%.*]] = mul nsw i32 1, [[TMP1]] +// SIMD-ONLY12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA]], i32 [[TMP6]] +// SIMD-ONLY12-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i32 1 +// SIMD-ONLY12-NEXT: store i16 [[CONV4]], i16* [[ARRAYIDX5]], align 2 +// SIMD-ONLY12-NEXT: [[TMP7:%.*]] = mul nsw i32 1, [[TMP1]] +// SIMD-ONLY12-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, i16* [[VLA]], i32 [[TMP7]] +// SIMD-ONLY12-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX6]], i32 1 +// SIMD-ONLY12-NEXT: [[TMP8:%.*]] = load i16, i16* [[ARRAYIDX7]], align 2 +// SIMD-ONLY12-NEXT: [[CONV8:%.*]] = sext i16 [[TMP8]] to i32 +// SIMD-ONLY12-NEXT: [[TMP9:%.*]] = load i32, i32* [[B]], align 4 +// SIMD-ONLY12-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], [[TMP9]] +// SIMD-ONLY12-NEXT: [[TMP10:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// SIMD-ONLY12-NEXT: call void @llvm.stackrestore(i8* [[TMP10]]) +// SIMD-ONLY12-NEXT: ret i32 [[ADD9]] +// +// +// SIMD-ONLY12-LABEL: define {{[^@]+}}@_ZL7fstatici +// SIMD-ONLY12-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0]] { +// SIMD-ONLY12-NEXT: entry: +// SIMD-ONLY12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY12-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY12-NEXT: [[AAA:%.*]] = alloca i8, align 1 +// SIMD-ONLY12-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY12-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY12-NEXT: store i8 0, i8* [[AAA]], align 1 +// SIMD-ONLY12-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY12-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// SIMD-ONLY12-NEXT: [[TMP1:%.*]] = load i8, i8* [[AAA]], align 1 +// SIMD-ONLY12-NEXT: [[CONV:%.*]] = sext i8 [[TMP1]] to i32 +// SIMD-ONLY12-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1 +// SIMD-ONLY12-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i8 +// SIMD-ONLY12-NEXT: store i8 [[CONV2]], i8* [[AAA]], align 1 +// SIMD-ONLY12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B]], i32 0, i32 2 +// SIMD-ONLY12-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY12-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP2]], 1 +// SIMD-ONLY12-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY12-NEXT: [[TMP3:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY12-NEXT: ret i32 [[TMP3]] +// +// +// SIMD-ONLY12-LABEL: define {{[^@]+}}@_Z9ftemplateIiET_i +// SIMD-ONLY12-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0]] comdat { +// SIMD-ONLY12-NEXT: entry: +// SIMD-ONLY12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY12-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY12-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY12-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY12-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY12-NEXT: 
[[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY12-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// SIMD-ONLY12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B]], i32 0, i32 2 +// SIMD-ONLY12-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY12-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP1]], 1 +// SIMD-ONLY12-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY12-NEXT: [[TMP2:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY12-NEXT: ret i32 [[TMP2]] +// +// +// SIMD-ONLY13-LABEL: define {{[^@]+}}@_Z3fooiPd +// SIMD-ONLY13-SAME: (i32 noundef [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +// SIMD-ONLY13-NEXT: entry: +// SIMD-ONLY13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY13-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 4 +// SIMD-ONLY13-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY13-NEXT: [[AA:%.*]] = alloca i16, align 2 +// SIMD-ONLY13-NEXT: [[B:%.*]] = alloca [10 x float], align 4 +// SIMD-ONLY13-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// SIMD-ONLY13-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// SIMD-ONLY13-NEXT: [[C:%.*]] = alloca [5 x [10 x double]], align 8 +// SIMD-ONLY13-NEXT: [[__VLA_EXPR1:%.*]] = alloca i32, align 4 +// SIMD-ONLY13-NEXT: [[D:%.*]] = alloca [[STRUCT_TT:%.*]], align 4 +// SIMD-ONLY13-NEXT: [[E:%.*]] = alloca [[STRUCT_TT_0:%.*]], align 4 +// SIMD-ONLY13-NEXT: [[P:%.*]] = alloca i32*, align 64 +// SIMD-ONLY13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY13-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 4 +// SIMD-ONLY13-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY13-NEXT: store i16 0, i16* [[AA]], align 2 +// SIMD-ONLY13-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY13-NEXT: [[TMP1:%.*]] = call i8* @llvm.stacksave() +// SIMD-ONLY13-NEXT: store i8* [[TMP1]], i8** [[SAVED_STACK]], align 4 +// SIMD-ONLY13-NEXT: [[VLA:%.*]] = alloca float, i32 [[TMP0]], align 4 +// SIMD-ONLY13-NEXT: store i32 [[TMP0]], i32* [[__VLA_EXPR0]], align 4 +// SIMD-ONLY13-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY13-NEXT: [[TMP3:%.*]] = mul nuw i32 5, [[TMP2]] +// SIMD-ONLY13-NEXT: [[VLA1:%.*]] = alloca double, i32 [[TMP3]], align 8 +// SIMD-ONLY13-NEXT: store i32 [[TMP2]], i32* [[__VLA_EXPR1]], align 4 +// SIMD-ONLY13-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 0 +// SIMD-ONLY13-NEXT: [[TMP4:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY13-NEXT: store i32 [[TMP4]], i32* [[X]], align 4 +// SIMD-ONLY13-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 1 +// SIMD-ONLY13-NEXT: [[TMP5:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY13-NEXT: store i32 [[TMP5]], i32* [[Y]], align 4 +// SIMD-ONLY13-NEXT: store i32* [[A]], i32** [[P]], align 64 +// SIMD-ONLY13-NEXT: [[TMP6:%.*]] = load i16, i16* [[AA]], align 2 +// SIMD-ONLY13-NEXT: [[CONV:%.*]] = sext i16 [[TMP6]] to i32 +// SIMD-ONLY13-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1 +// SIMD-ONLY13-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD]] to i16 +// SIMD-ONLY13-NEXT: store i16 [[CONV2]], i16* [[AA]], align 2 +// SIMD-ONLY13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[B]], i32 0, i32 2 +// SIMD-ONLY13-NEXT: store float 1.000000e+00, float* [[ARRAYIDX]], align 4 +// SIMD-ONLY13-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[VLA]], i32 3 +// SIMD-ONLY13-NEXT: store float 1.000000e+00, float* [[ARRAYIDX3]], align 
4 +// SIMD-ONLY13-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[C]], i32 0, i32 1 +// SIMD-ONLY13-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX4]], i32 0, i32 2 +// SIMD-ONLY13-NEXT: store double 1.000000e+00, double* [[ARRAYIDX5]], align 8 +// SIMD-ONLY13-NEXT: [[TMP7:%.*]] = mul nsw i32 1, [[TMP2]] +// SIMD-ONLY13-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[VLA1]], i32 [[TMP7]] +// SIMD-ONLY13-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX6]], i32 3 +// SIMD-ONLY13-NEXT: store double 1.000000e+00, double* [[ARRAYIDX7]], align 8 +// SIMD-ONLY13-NEXT: [[X8:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D]], i32 0, i32 0 +// SIMD-ONLY13-NEXT: store i64 1, i64* [[X8]], align 4 +// SIMD-ONLY13-NEXT: [[Y9:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[D]], i32 0, i32 1 +// SIMD-ONLY13-NEXT: store i8 1, i8* [[Y9]], align 4 +// SIMD-ONLY13-NEXT: [[X10:%.*]] = getelementptr inbounds [[STRUCT_TT_0]], %struct.TT.0* [[E]], i32 0, i32 0 +// SIMD-ONLY13-NEXT: [[TMP8:%.*]] = load i32, i32* [[X10]], align 4 +// SIMD-ONLY13-NEXT: [[CONV11:%.*]] = sitofp i32 [[TMP8]] to double +// SIMD-ONLY13-NEXT: [[TMP9:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// SIMD-ONLY13-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[TMP9]], i32 0 +// SIMD-ONLY13-NEXT: store double [[CONV11]], double* [[ARRAYIDX12]], align 4 +// SIMD-ONLY13-NEXT: [[TMP10:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// SIMD-ONLY13-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0 +// SIMD-ONLY13-NEXT: [[TMP11:%.*]] = load double, double* [[ARRAYIDX13]], align 4 +// SIMD-ONLY13-NEXT: [[INC:%.*]] = fadd double [[TMP11]], 1.000000e+00 +// SIMD-ONLY13-NEXT: store double [[INC]], double* [[ARRAYIDX13]], align 4 +// SIMD-ONLY13-NEXT: [[TMP12:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY13-NEXT: [[TMP13:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// SIMD-ONLY13-NEXT: call void @llvm.stackrestore(i8* [[TMP13]]) +// SIMD-ONLY13-NEXT: ret i32 [[TMP12]] +// +// +// SIMD-ONLY13-LABEL: define {{[^@]+}}@_Z3bariPd +// SIMD-ONLY13-SAME: (i32 noundef [[N:%.*]], double* noundef [[PTR:%.*]]) #[[ATTR0]] { +// SIMD-ONLY13-NEXT: entry: +// SIMD-ONLY13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY13-NEXT: [[PTR_ADDR:%.*]] = alloca double*, align 4 +// SIMD-ONLY13-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY13-NEXT: [[S:%.*]] = alloca [[STRUCT_S1:%.*]], align 4 +// SIMD-ONLY13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY13-NEXT: store double* [[PTR]], double** [[PTR_ADDR]], align 4 +// SIMD-ONLY13-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY13-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY13-NEXT: [[TMP1:%.*]] = load double*, double** [[PTR_ADDR]], align 4 +// SIMD-ONLY13-NEXT: [[CALL:%.*]] = call noundef i32 @_Z3fooiPd(i32 noundef [[TMP0]], double* noundef [[TMP1]]) +// SIMD-ONLY13-NEXT: [[TMP2:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY13-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[CALL]] +// SIMD-ONLY13-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// SIMD-ONLY13-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY13-NEXT: [[CALL1:%.*]] = call noundef i32 @_ZN2S12r1Ei(%struct.S1* noundef nonnull align 4 dereferenceable(8) [[S]], i32 noundef [[TMP3]]) +// SIMD-ONLY13-NEXT: [[TMP4:%.*]] = load i32, i32* [[A]], align 4 +// 
SIMD-ONLY13-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP4]], [[CALL1]] +// SIMD-ONLY13-NEXT: store i32 [[ADD2]], i32* [[A]], align 4 +// SIMD-ONLY13-NEXT: [[TMP5:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY13-NEXT: [[CALL3:%.*]] = call noundef i32 @_ZL7fstatici(i32 noundef [[TMP5]]) +// SIMD-ONLY13-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY13-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP6]], [[CALL3]] +// SIMD-ONLY13-NEXT: store i32 [[ADD4]], i32* [[A]], align 4 +// SIMD-ONLY13-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY13-NEXT: [[CALL5:%.*]] = call noundef i32 @_Z9ftemplateIiET_i(i32 noundef [[TMP7]]) +// SIMD-ONLY13-NEXT: [[TMP8:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY13-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP8]], [[CALL5]] +// SIMD-ONLY13-NEXT: store i32 [[ADD6]], i32* [[A]], align 4 +// SIMD-ONLY13-NEXT: [[TMP9:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY13-NEXT: ret i32 [[TMP9]] +// +// +// SIMD-ONLY13-LABEL: define {{[^@]+}}@_ZN2S12r1Ei +// SIMD-ONLY13-SAME: (%struct.S1* noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0]] comdat align 2 { +// SIMD-ONLY13-NEXT: entry: +// SIMD-ONLY13-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 4 +// SIMD-ONLY13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY13-NEXT: [[B:%.*]] = alloca i32, align 4 +// SIMD-ONLY13-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4 +// SIMD-ONLY13-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4 +// SIMD-ONLY13-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 4 +// SIMD-ONLY13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY13-NEXT: [[THIS1:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 4 +// SIMD-ONLY13-NEXT: [[TMP0:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY13-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY13-NEXT: store i32 [[ADD]], i32* [[B]], align 4 +// SIMD-ONLY13-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// SIMD-ONLY13-NEXT: [[TMP2:%.*]] = call i8* @llvm.stacksave() +// SIMD-ONLY13-NEXT: store i8* [[TMP2]], i8** [[SAVED_STACK]], align 4 +// SIMD-ONLY13-NEXT: [[TMP3:%.*]] = mul nuw i32 2, [[TMP1]] +// SIMD-ONLY13-NEXT: [[VLA:%.*]] = alloca i16, i32 [[TMP3]], align 2 +// SIMD-ONLY13-NEXT: store i32 [[TMP1]], i32* [[__VLA_EXPR0]], align 4 +// SIMD-ONLY13-NEXT: [[TMP4:%.*]] = load i32, i32* [[B]], align 4 +// SIMD-ONLY13-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP4]] to double +// SIMD-ONLY13-NEXT: [[ADD2:%.*]] = fadd double [[CONV]], 1.500000e+00 +// SIMD-ONLY13-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY13-NEXT: store double [[ADD2]], double* [[A]], align 4 +// SIMD-ONLY13-NEXT: [[A3:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[THIS1]], i32 0, i32 0 +// SIMD-ONLY13-NEXT: [[TMP5:%.*]] = load double, double* [[A3]], align 4 +// SIMD-ONLY13-NEXT: [[INC:%.*]] = fadd double [[TMP5]], 1.000000e+00 +// SIMD-ONLY13-NEXT: store double [[INC]], double* [[A3]], align 4 +// SIMD-ONLY13-NEXT: [[CONV4:%.*]] = fptosi double [[INC]] to i16 +// SIMD-ONLY13-NEXT: [[TMP6:%.*]] = mul nsw i32 1, [[TMP1]] +// SIMD-ONLY13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA]], i32 [[TMP6]] +// SIMD-ONLY13-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i32 1 +// SIMD-ONLY13-NEXT: store i16 [[CONV4]], i16* [[ARRAYIDX5]], align 2 +// SIMD-ONLY13-NEXT: [[TMP7:%.*]] = mul nsw i32 1, [[TMP1]] +// SIMD-ONLY13-NEXT: [[ARRAYIDX6:%.*]] = 
getelementptr inbounds i16, i16* [[VLA]], i32 [[TMP7]] +// SIMD-ONLY13-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX6]], i32 1 +// SIMD-ONLY13-NEXT: [[TMP8:%.*]] = load i16, i16* [[ARRAYIDX7]], align 2 +// SIMD-ONLY13-NEXT: [[CONV8:%.*]] = sext i16 [[TMP8]] to i32 +// SIMD-ONLY13-NEXT: [[TMP9:%.*]] = load i32, i32* [[B]], align 4 +// SIMD-ONLY13-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], [[TMP9]] +// SIMD-ONLY13-NEXT: [[TMP10:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4 +// SIMD-ONLY13-NEXT: call void @llvm.stackrestore(i8* [[TMP10]]) +// SIMD-ONLY13-NEXT: ret i32 [[ADD9]] +// +// +// SIMD-ONLY13-LABEL: define {{[^@]+}}@_ZL7fstatici +// SIMD-ONLY13-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0]] { +// SIMD-ONLY13-NEXT: entry: +// SIMD-ONLY13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY13-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY13-NEXT: [[AAA:%.*]] = alloca i8, align 1 +// SIMD-ONLY13-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY13-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY13-NEXT: store i8 0, i8* [[AAA]], align 1 +// SIMD-ONLY13-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY13-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY13-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// SIMD-ONLY13-NEXT: [[TMP1:%.*]] = load i8, i8* [[AAA]], align 1 +// SIMD-ONLY13-NEXT: [[CONV:%.*]] = sext i8 [[TMP1]] to i32 +// SIMD-ONLY13-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1 +// SIMD-ONLY13-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i8 +// SIMD-ONLY13-NEXT: store i8 [[CONV2]], i8* [[AAA]], align 1 +// SIMD-ONLY13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B]], i32 0, i32 2 +// SIMD-ONLY13-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY13-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP2]], 1 +// SIMD-ONLY13-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY13-NEXT: [[TMP3:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY13-NEXT: ret i32 [[TMP3]] +// +// +// SIMD-ONLY13-LABEL: define {{[^@]+}}@_Z9ftemplateIiET_i +// SIMD-ONLY13-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0]] comdat { +// SIMD-ONLY13-NEXT: entry: +// SIMD-ONLY13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY13-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY13-NEXT: [[B:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// SIMD-ONLY13-NEXT: store i32 0, i32* [[A]], align 4 +// SIMD-ONLY13-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY13-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY13-NEXT: store i32 [[ADD]], i32* [[A]], align 4 +// SIMD-ONLY13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B]], i32 0, i32 2 +// SIMD-ONLY13-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY13-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP1]], 1 +// SIMD-ONLY13-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// SIMD-ONLY13-NEXT: [[TMP2:%.*]] = load i32, i32* [[A]], align 4 +// SIMD-ONLY13-NEXT: ret i32 [[TMP2]] +// diff --git a/clang/test/OpenMP/target_is_device_ptr_codegen.cpp b/clang/test/OpenMP/target_is_device_ptr_codegen.cpp index b010d0dcea192..09c4d568bb096 100644 --- a/clang/test/OpenMP/target_is_device_ptr_codegen.cpp +++ b/clang/test/OpenMP/target_is_device_ptr_codegen.cpp @@ -1,193 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature 
--include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ // expected-no-diagnostics #ifndef HEADER #define HEADER ///==========================================================================/// -// RUN: %clang_cc1 -DCK1 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64 +// RUN: %clang_cc1 -DCK1 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK10 // RUN: %clang_cc1 -DCK1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64 -// RUN: %clang_cc1 -DCK1 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32 +// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK11 +// RUN: %clang_cc1 -DCK1 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK12 // RUN: %clang_cc1 -DCK1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32 +// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK13 -// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY00 %s // RUN: %clang_cc1 -DCK1 -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY01 %s +// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY02 %s // RUN: %clang_cc1 -DCK1 -fopenmp-simd 
-fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// SIMD-ONLY0-NOT: {{__kmpc|__tgt}} +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY03 %s #ifdef CK1 double *g; -// CK1: @g ={{.*}} global ptr -// CK1: [[SIZES00:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} {{8|4}}] -// CK1: [[TYPES00:@.+]] = {{.+}}constant [1 x i64] [i64 288] -// CK1: [[SIZES01:@.+]] = {{.+}}constant [1 x i[[sz]]] [i[[sz]] {{8|4}}] -// CK1: [[TYPES01:@.+]] = {{.+}}constant [1 x i64] [i64 288] -// CK1: [[SIZES02:@.+]] = {{.+}}constant [1 x i[[sz]]] [i[[sz]] {{8|4}}] -// CK1: [[TYPES02:@.+]] = {{.+}}constant [1 x i64] [i64 288] -// CK1: [[SIZES03:@.+]] = {{.+}}constant [1 x i[[sz]]] [i[[sz]] {{8|4}}] -// CK1: [[TYPES03:@.+]] = {{.+}}constant [1 x i64] [i64 288] -// CK1: [[SIZES04:@.+]] = {{.+}}constant [1 x i[[sz]]] [i[[sz]] {{8|4}}] -// CK1: [[TYPES04:@.+]] = {{.+}}constant [1 x i64] [i64 288] -// CK1: [[SIZES05:@.+]] = {{.+}}constant [1 x i[[sz]]] [i[[sz]] {{8|4}}] -// CK1: [[TYPES05:@.+]] = {{.+}}constant [1 x i64] [i64 288] -// CK1: [[SIZES06:@.+]] = {{.+}}constant [2 x i[[sz]]] [i[[sz]] {{8|4}}, i[[sz]] {{8|4}}] -// CK1: [[TYPES06:@.+]] = {{.+}}constant [2 x i64] [i64 288, i64 288] -// CK1-LABEL: @_Z3foo{{.*}}( template void foo(float *&lr, T *&tr) { float *l; T *t; -// CK1-DAG: [[RET:%.+]] = call i32 @__tgt_target_kernel(ptr @{{.+}}, i64 [[DEVICE:.+]], i32 -1, i32 0, ptr @.{{.+}}.region_id, ptr [[ARGS:%.+]]) -// CK1-DAG: [[BPARG:%.+]] = getelementptr inbounds {{.+}}[[ARGS]], i32 0, i32 2 -// CK1-DAG: store ptr [[BPGEP:%.+]], ptr [[BPARG]] -// CK1-DAG: [[PARG:%.+]] = getelementptr inbounds {{.+}}[[ARGS]], i32 0, i32 3 -// CK1-DAG: store ptr [[PGEP:%.+]], ptr [[PARG]] -// CK1-DAG: [[BPGEP]] = getelementptr inbounds {{.+}}[[BPS:%[^,]+]], i32 0, i32 0 -// CK1-DAG: [[PGEP]] = getelementptr inbounds {{.+}}[[PS:%[^,]+]], i32 0, i32 0 -// CK1-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BPS]], i32 0, i32 0 -// CK1-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[PS]], i32 0, i32 0 -// CK1-DAG: store ptr [[VAL:%.+]], ptr [[BP1]] -// CK1-DAG: store ptr [[VAL]], ptr [[P1]] -// CK1-DAG: [[VAL]] = load ptr, ptr [[ADDR:@g]], - -// CK1: call void [[KERNEL:@.+]](ptr [[VAL]]) + #pragma omp target is_device_ptr(g) { ++g; } -// CK1-DAG: [[RET:%.+]] = call i32 @__tgt_target_kernel(ptr @{{.+}}, i64 [[DEVICE:.+]], i32 -1, i32 0, ptr @.{{.+}}.region_id, ptr [[ARGS:%.+]]) -// CK1-DAG: [[BPARG:%.+]] = getelementptr inbounds {{.+}}[[ARGS]], i32 0, i32 2 -// CK1-DAG: store ptr [[BPGEP:%.+]], ptr [[BPARG]] -// CK1-DAG: [[PARG:%.+]] = getelementptr inbounds {{.+}}[[ARGS]], i32 0, i32 3 -// CK1-DAG: store ptr [[PGEP:%.+]], ptr [[PARG]] -// CK1-DAG: [[BPGEP]] = getelementptr inbounds {{.+}}[[BPS:%[^,]+]], i32 0, i32 0 -// CK1-DAG: [[PGEP]] = getelementptr inbounds {{.+}}[[PS:%[^,]+]], i32 0, i32 0 -// CK1-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BPS]], i32 0, i32 0 -// CK1-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[PS]], i32 0, i32 0 -// CK1-DAG: store ptr [[VAL:%.+]], ptr [[BP1]] -// CK1-DAG: store ptr [[VAL]], ptr [[P1]] -// CK1-DAG: [[VAL]] = load ptr, ptr [[ADDR:%.+]], - -// CK1: call void [[KERNEL:@.+]](ptr [[VAL]]) + 
#pragma omp target is_device_ptr(l) { ++l; } -// CK1-DAG: [[RET:%.+]] = call i32 @__tgt_target_kernel(ptr @{{.+}}, i64 [[DEVICE:.+]], i32 -1, i32 0, ptr @.{{.+}}.region_id, ptr [[ARGS:%.+]]) -// CK1-DAG: [[BPARG:%.+]] = getelementptr inbounds {{.+}}[[ARGS]], i32 0, i32 2 -// CK1-DAG: store ptr [[BPGEP:%.+]], ptr [[BPARG]] -// CK1-DAG: [[PARG:%.+]] = getelementptr inbounds {{.+}}[[ARGS]], i32 0, i32 3 -// CK1-DAG: store ptr [[PGEP:%.+]], ptr [[PARG]] -// CK1-DAG: [[BPGEP]] = getelementptr inbounds {{.+}}[[BPS:%[^,]+]], i32 0, i32 0 -// CK1-DAG: [[PGEP]] = getelementptr inbounds {{.+}}[[PS:%[^,]+]], i32 0, i32 0 -// CK1-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BPS]], i32 0, i32 0 -// CK1-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[PS]], i32 0, i32 0 -// CK1-DAG: store ptr [[VAL:%.+]], ptr [[BP1]] -// CK1-DAG: store ptr [[VAL]], ptr [[P1]] -// CK1-DAG: [[VAL]] = load ptr, ptr [[ADDR:%.+]], - -// CK1: call void [[KERNEL:@.+]](ptr [[VAL]]) + #pragma omp target is_device_ptr(t) { ++t; } -// CK1-DAG: [[RET:%.+]] = call i32 @__tgt_target_kernel(ptr @{{.+}}, i64 [[DEVICE:.+]], i32 -1, i32 0, ptr @.{{.+}}.region_id, ptr [[ARGS:%.+]]) -// CK1-DAG: [[BPARG:%.+]] = getelementptr inbounds {{.+}}[[ARGS]], i32 0, i32 2 -// CK1-DAG: store ptr [[BPGEP:%.+]], ptr [[BPARG]] -// CK1-DAG: [[PARG:%.+]] = getelementptr inbounds {{.+}}[[ARGS]], i32 0, i32 3 -// CK1-DAG: store ptr [[PGEP:%.+]], ptr [[PARG]] -// CK1-DAG: [[BPGEP]] = getelementptr inbounds {{.+}}[[BPS:%[^,]+]], i32 0, i32 0 -// CK1-DAG: [[PGEP]] = getelementptr inbounds {{.+}}[[PS:%[^,]+]], i32 0, i32 0 -// CK1-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BPS]], i32 0, i32 0 -// CK1-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[PS]], i32 0, i32 0 -// CK1-DAG: store ptr [[VAL:%.+]], ptr [[BP1]] -// CK1-DAG: store ptr [[VAL]], ptr [[P1]] -// CK1-DAG: [[VAL]] = load ptr, ptr [[ADDR:%.+]], -// CK1-DAG: [[ADDR]] = load ptr, ptr [[ADDR2:%.+]], - -// CK1: call void [[KERNEL:@.+]](ptr [[VAL]]) + #pragma omp target is_device_ptr(lr) { ++lr; } -// CK1-DAG: [[RET:%.+]] = call i32 @__tgt_target_kernel(ptr @{{.+}}, i64 [[DEVICE:.+]], i32 -1, i32 0, ptr @.{{.+}}.region_id, ptr [[ARGS:%.+]]) -// CK1-DAG: [[BPARG:%.+]] = getelementptr inbounds {{.+}}[[ARGS]], i32 0, i32 2 -// CK1-DAG: store ptr [[BPGEP:%.+]], ptr [[BPARG]] -// CK1-DAG: [[PARG:%.+]] = getelementptr inbounds {{.+}}[[ARGS]], i32 0, i32 3 -// CK1-DAG: store ptr [[PGEP:%.+]], ptr [[PARG]] -// CK1-DAG: [[BPGEP]] = getelementptr inbounds {{.+}}[[BPS:%[^,]+]], i32 0, i32 0 -// CK1-DAG: [[PGEP]] = getelementptr inbounds {{.+}}[[PS:%[^,]+]], i32 0, i32 0 -// CK1-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BPS]], i32 0, i32 0 -// CK1-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[PS]], i32 0, i32 0 -// CK1-DAG: store ptr [[VAL:%.+]], ptr [[BP1]] -// CK1-DAG: store ptr [[VAL]], ptr [[P1]] -// CK1-DAG: [[VAL]] = load ptr, ptr [[ADDR:%.+]], -// CK1-DAG: [[ADDR]] = load ptr, ptr [[ADDR2:%.+]], - -// CK1: call void [[KERNEL:@.+]](ptr [[VAL]]) + #pragma omp target is_device_ptr(tr) { ++tr; } -// CK1-DAG: [[RET:%.+]] = call i32 @__tgt_target_kernel(ptr @{{.+}}, i64 [[DEVICE:.+]], i32 -1, i32 0, ptr @.{{.+}}.region_id, ptr [[ARGS:%.+]]) -// CK1-DAG: [[BPARG:%.+]] = getelementptr inbounds {{.+}}[[ARGS]], i32 0, i32 2 -// CK1-DAG: store ptr [[BPGEP:%.+]], ptr [[BPARG]] -// CK1-DAG: [[PARG:%.+]] = getelementptr inbounds {{.+}}[[ARGS]], i32 0, i32 3 -// CK1-DAG: store ptr [[PGEP:%.+]], ptr [[PARG]] -// CK1-DAG: [[BPGEP]] = getelementptr inbounds {{.+}}[[BPS:%[^,]+]], i32 0, i32 0 -// CK1-DAG: 
[[PGEP]] = getelementptr inbounds {{.+}}[[PS:%[^,]+]], i32 0, i32 0 -// CK1-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BPS]], i32 0, i32 0 -// CK1-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[PS]], i32 0, i32 0 -// CK1-DAG: store ptr [[VAL:%.+]], ptr [[BP1]] -// CK1-DAG: store ptr [[VAL]], ptr [[P1]] -// CK1-DAG: [[VAL]] = load ptr, ptr [[ADDR:%.+]], -// CK1-DAG: [[ADDR]] = load ptr, ptr [[ADDR2:%.+]], - -// CK1: call void [[KERNEL:@.+]](ptr [[VAL]]) + #pragma omp target is_device_ptr(tr, lr) { ++tr; } -// CK1-DAG: [[RET:%.+]] = call i32 @__tgt_target_kernel(ptr @{{.+}}, i64 [[DEVICE:.+]], i32 -1, i32 0, ptr @.{{.+}}.region_id, ptr [[ARGS:%.+]]) -// CK1-DAG: [[BPARG:%.+]] = getelementptr inbounds {{.+}}[[ARGS]], i32 0, i32 2 -// CK1-DAG: store ptr [[BPGEP:%.+]], ptr [[BPARG]] -// CK1-DAG: [[PARG:%.+]] = getelementptr inbounds {{.+}}[[ARGS]], i32 0, i32 3 -// CK1-DAG: store ptr [[PGEP:%.+]], ptr [[PARG]] -// CK1-DAG: [[BPGEP]] = getelementptr inbounds {{.+}}[[BPS:%[^,]+]], i32 0, i32 0 -// CK1-DAG: [[PGEP]] = getelementptr inbounds {{.+}}[[PS:%[^,]+]], i32 0, i32 0 -// CK1-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BPS]], i32 0, i32 0 -// CK1-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[PS]], i32 0, i32 0 -// CK1-DAG: store ptr [[VAL:%.+]], ptr [[BP1]] -// CK1-DAG: store ptr [[VAL]], ptr [[P1]] -// CK1-DAG: [[VAL]] = load ptr, ptr [[ADDR:%.+]], -// CK1-DAG: [[ADDR]] = load ptr, ptr [[ADDR2:%.+]], - -// CK1-DAG: [[_BP1:%.+]] = getelementptr inbounds {{.+}}[[BPS]], i32 0, i32 1 -// CK1-DAG: [[_P1:%.+]] = getelementptr inbounds {{.+}}[[PS]], i32 0, i32 1 -// CK1-DAG: store ptr [[_VAL:%.+]], ptr [[_BP1]] -// CK1-DAG: store ptr [[_VAL]], ptr [[_P1]] -// CK1-DAG: [[_VAL]] = load ptr, ptr [[_ADDR:%.+]], -// CK1-DAG: [[_ADDR]] = load ptr, ptr [[_ADDR2:%.+]], - -// CK1: call void [[KERNEL:@.+]](ptr [[VAL]], ptr [[_VAL]]) + + #pragma omp target is_device_ptr(tr, lr) { ++tr,++lr; @@ -200,23 +83,21 @@ void bar(float *&a, int *&b) { #endif ///==========================================================================/// -// RUN: %clang_cc1 -DCK2 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64 +// RUN: %clang_cc1 -DCK2 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK20 // RUN: %clang_cc1 -DCK2 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64 -// RUN: %clang_cc1 -DCK2 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32 +// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK21 +// RUN: %clang_cc1 -DCK2 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK22 // RUN: %clang_cc1 -DCK2 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp 
-fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32 +// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK23 -// RUN: %clang_cc1 -DCK2 -verify -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -DCK2 -verify -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY10 %s // RUN: %clang_cc1 -DCK2 -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s -// RUN: %clang_cc1 -DCK2 -verify -fopenmp-simd -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY11 %s +// RUN: %clang_cc1 -DCK2 -verify -fopenmp-simd -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY12 %s // RUN: %clang_cc1 -DCK2 -fopenmp-simd -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s -// SIMD-ONLY1-NOT: {{__kmpc|__tgt}} +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY13 %s #ifdef CK2 -// CK2: [[ST:%.+]] = type { ptr, ptr } template struct ST { @@ -224,70 +105,23 @@ struct ST { double *&b; ST(double *&b) : a(0), b(b) {} - // CK2-LABEL: @{{.*}}foo{{.*}} void foo(double *&arg) { int *la = 0; -// CK2-DAG: [[RET:%.+]] = call i32 @__tgt_target_kernel(ptr @{{.+}}, i64 [[DEVICE:.+]], i32 -1, i32 0, ptr @.{{.+}}.region_id, ptr [[ARGS:%.+]]) -// CK2-DAG: [[BPARG:%.+]] = getelementptr inbounds {{.+}}[[ARGS]], i32 0, i32 2 -// CK2-DAG: store ptr [[BPGEP:%.+]], ptr [[BPARG]] -// CK2-DAG: [[PARG:%.+]] = getelementptr inbounds {{.+}}[[ARGS]], i32 0, i32 3 -// CK2-DAG: store ptr [[PGEP:%.+]], ptr [[PARG]] -// CK2-DAG: [[BPGEP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]] -// CK2-DAG: [[PGEP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]] - -// CK2-DAG: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[THIS1:%.+]], i32 0, i32 0 -// CK2-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0 -// CK2-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0 -// CK2-DAG: store ptr [[THIS1]], ptr [[BP0]] -// CK2-DAG: store ptr [[A]], ptr [[P0]] + #pragma omp target is_device_ptr(a) { a++; } -// CK2-DAG: [[RET:%.+]] = call i32 @__tgt_target_kernel(ptr @{{.+}}, i64 [[DEVICE:.+]], i32 -1, i32 0, 
ptr @.{{.+}}.region_id, ptr [[ARGS:%.+]]) -// CK2-DAG: [[BPARG:%.+]] = getelementptr inbounds {{.+}}[[ARGS]], i32 0, i32 2 -// CK2-DAG: store ptr [[BPGEP:%.+]], ptr [[BPARG]] -// CK2-DAG: [[PARG:%.+]] = getelementptr inbounds {{.+}}[[ARGS]], i32 0, i32 3 -// CK2-DAG: store ptr [[PGEP:%.+]], ptr [[PARG]] -// CK2-DAG: [[SARG:%.+]] = getelementptr inbounds {{.+}}[[ARGS]], i32 0, i32 4 -// CK2-DAG: store ptr [[SIZE:%.+]], ptr [[SARG]] -// CK2-DAG: [[BPGEP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]] -// CK2-DAG: [[PGEP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]] - -// CK2-DAG: [[S:%[^,]+]] = sdiv exact i64 [[SZ:%.+]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) -// CK2-DAG: [[SIZE:%[^,]+]] = getelementptr inbounds [2 x i64], ptr %.offload_sizes, i32 0, i32 0 -// CK2-DAG: store i64 [[S]], ptr [[SIZE]] -// CK2-DAG: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 -// CK2-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0 -// CK2-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0 -// CK2-DAG: store ptr [[THIS1]], ptr [[BP0]] -// CK2-DAG: store ptr [[B]], ptr [[P0]] + #pragma omp target is_device_ptr(b) { b++; } -// CK2-DAG: [[RET:%.+]] = call i32 @__tgt_target_kernel(ptr @{{.+}}, i64 [[DEVICE:.+]], i32 -1, i32 0, ptr @.{{.+}}.region_id, ptr [[ARGS:%.+]]) -// CK2-DAG: [[BPARG:%.+]] = getelementptr inbounds {{.+}}[[ARGS]], i32 0, i32 2 -// CK2-DAG: store ptr [[BPGEP:%.+]], ptr [[BPARG]] -// CK2-DAG: [[PARG:%.+]] = getelementptr inbounds {{.+}}[[ARGS]], i32 0, i32 3 -// CK2-DAG: store ptr [[PGEP:%.+]], ptr [[PARG]] -// CK2-DAG: [[SARG:%.+]] = getelementptr inbounds {{.+}}[[ARGS]], i32 0, i32 4 -// CK2-DAG: store ptr [[SIZE:%.+]], ptr [[SARG]] -// CK2-DAG: [[BPGEP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]] -// CK2-DAG: [[PGEP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]] - -// CK2-DAG: [[A8:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 0 -// CK2-DAG: [[B9:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 -// CK2-DAG: [[S:%[^,]+]] = sdiv exact i64 [[SZ:%.+]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) -// CK2-DAG: store i64 [[S]], ptr [[SIZE:%.+]] - -// CK2-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0 -// CK2-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0 -// CK2-DAG: store ptr [[THIS1]], ptr [[BP0]] -// CK2-DAG: store ptr [[A8]], ptr [[TMP64:%.+]] + + #pragma omp target is_device_ptr(a, b) { a++; @@ -303,42 +137,5595 @@ void bar(double *arg){ } #endif ///==========================================================================/// -// RUN: %clang_cc1 -DCK3 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK3 --check-prefix CK3-64 +// RUN: %clang_cc1 -DCK3 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK30 // RUN: %clang_cc1 -DCK3 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK3 --check-prefix CK3-64 -// RUN: %clang_cc1 -DCK3 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple 
i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK3 --check-prefix CK3-32 +// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK31 +// RUN: %clang_cc1 -DCK3 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK32 // RUN: %clang_cc1 -DCK3 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK3 --check-prefix CK3-32 +// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK33 -// RUN: %clang_cc1 -DCK3 -verify -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -DCK3 -verify -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY20 %s // RUN: %clang_cc1 -DCK3 -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s -// RUN: %clang_cc1 -DCK3 -verify -fopenmp-simd -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY1 %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY21 %s +// RUN: %clang_cc1 -DCK3 -verify -fopenmp-simd -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY22 %s // RUN: %clang_cc1 -DCK3 -fopenmp-simd -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s -// SIMD-ONLY1-NOT: {{__kmpc|__tgt}} +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY23 %s #ifdef CK3 -// CK3-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[SZ:64|32]]] [i{{64|32}} {{8|4}}] // OMP_MAP_TARGET_PARAM = 0x20 | OMP_MAP_TO = 0x1 = 0x21 -// CK3-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i64] [i64 [[#0x21]]] void bar() { __attribute__((aligned(64))) double *ptr; - // CK3-DAG: [[RET:%.+]] = call i32 @__tgt_target_kernel(ptr @{{.+}}, i64 [[DEVICE:.+]], i32 -1, i32 0, ptr @.{{.+}}.region_id, ptr [[ARGS:%.+]]) - // CK3-DAG: [[BPARG:%.+]] = getelementptr inbounds {{.+}}[[ARGS]], i32 0, i32 2 - // CK3-DAG: store ptr [[BPGEP:%.+]], ptr [[BPARG]] - // CK3-DAG: [[PARG:%.+]] = getelementptr inbounds {{.+}}[[ARGS]], i32 0, i32 3 
- // CK3-DAG: store ptr [[PGEP:%.+]], ptr [[PARG]] - // CK3-DAG: [[BPGEP]] = getelementptr inbounds {{.+}}[[BPS:%[^,]+]], i32 0, i32 0 - // CK3-DAG: [[PGEP]] = getelementptr inbounds {{.+}}[[PS:%[^,]+]], i32 0, i32 0 - // CK3-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BPS]], i32 0, i32 0 - // CK3-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[PS]], i32 0, i32 0 - // CK3-DAG: store ptr [[PTR:%.+]], ptr [[BP1]] - // CK3-DAG: store ptr [[PTR]], ptr [[P1]] - - // CK3: call void [[KERNEL:@.+]](ptr [[PTR]]) + #pragma omp target is_device_ptr(ptr) *ptr = 0; } #endif #endif +// CK1-64-LABEL: define {{[^@]+}}@_Z3barRPfRPi +// CK1-64-SAME: (ptr noundef nonnull align 8 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]]) #[[ATTR0:[0-9]+]] { +// CK1-64-NEXT: entry: +// CK1-64-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +// CK1-64-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// CK1-64-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 +// CK1-64-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// CK1-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// CK1-64-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// CK1-64-NEXT: call void @_Z3fooIiEvRPfRPT_(ptr noundef nonnull align 8 dereferenceable(8) [[TMP0]], ptr noundef nonnull align 8 dereferenceable(8) [[TMP1]]) +// CK1-64-NEXT: ret void +// CK1-64-LABEL: define {{[^@]+}}@_Z3fooIiEvRPfRPT_ +// CK1-64-SAME: (ptr noundef nonnull align 8 dereferenceable(8) [[LR:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[TR:%.*]]) #[[ATTR0]] comdat { +// CK1-64-NEXT: entry: +// CK1-64-NEXT: [[LR_ADDR:%.*]] = alloca ptr, align 8 +// CK1-64-NEXT: [[TR_ADDR:%.*]] = alloca ptr, align 8 +// CK1-64-NEXT: [[L:%.*]] = alloca ptr, align 8 +// CK1-64-NEXT: [[T:%.*]] = alloca ptr, align 8 +// CK1-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +// CK1-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 +// CK1-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 +// CK1-64-NEXT: [[DOTOFFLOAD_BASEPTRS1:%.*]] = alloca [1 x ptr], align 8 +// CK1-64-NEXT: [[DOTOFFLOAD_PTRS2:%.*]] = alloca [1 x ptr], align 8 +// CK1-64-NEXT: [[DOTOFFLOAD_MAPPERS3:%.*]] = alloca [1 x ptr], align 8 +// CK1-64-NEXT: [[DOTOFFLOAD_BASEPTRS7:%.*]] = alloca [1 x ptr], align 8 +// CK1-64-NEXT: [[DOTOFFLOAD_PTRS8:%.*]] = alloca [1 x ptr], align 8 +// CK1-64-NEXT: [[DOTOFFLOAD_MAPPERS9:%.*]] = alloca [1 x ptr], align 8 +// CK1-64-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// CK1-64-NEXT: [[DOTOFFLOAD_BASEPTRS13:%.*]] = alloca [1 x ptr], align 8 +// CK1-64-NEXT: [[DOTOFFLOAD_PTRS14:%.*]] = alloca [1 x ptr], align 8 +// CK1-64-NEXT: [[DOTOFFLOAD_MAPPERS15:%.*]] = alloca [1 x ptr], align 8 +// CK1-64-NEXT: [[_TMP19:%.*]] = alloca ptr, align 8 +// CK1-64-NEXT: [[DOTOFFLOAD_BASEPTRS20:%.*]] = alloca [1 x ptr], align 8 +// CK1-64-NEXT: [[DOTOFFLOAD_PTRS21:%.*]] = alloca [1 x ptr], align 8 +// CK1-64-NEXT: [[DOTOFFLOAD_MAPPERS22:%.*]] = alloca [1 x ptr], align 8 +// CK1-64-NEXT: [[_TMP26:%.*]] = alloca ptr, align 8 +// CK1-64-NEXT: [[DOTOFFLOAD_BASEPTRS27:%.*]] = alloca [1 x ptr], align 8 +// CK1-64-NEXT: [[DOTOFFLOAD_PTRS28:%.*]] = alloca [1 x ptr], align 8 +// CK1-64-NEXT: [[DOTOFFLOAD_MAPPERS29:%.*]] = alloca [1 x ptr], align 8 +// CK1-64-NEXT: [[_TMP33:%.*]] = alloca ptr, align 8 +// CK1-64-NEXT: [[_TMP34:%.*]] = alloca ptr, align 8 +// CK1-64-NEXT: [[DOTOFFLOAD_BASEPTRS35:%.*]] = alloca [2 x ptr], align 8 +// CK1-64-NEXT: [[DOTOFFLOAD_PTRS36:%.*]] = alloca [2 x ptr], align 8 +// CK1-64-NEXT: 
[[DOTOFFLOAD_MAPPERS37:%.*]] = alloca [2 x ptr], align 8 +// CK1-64-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 8 +// CK1-64-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 8 +// CK1-64-NEXT: [[TMP0:%.*]] = load ptr, ptr @g, align 8 +// CK1-64-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK1-64-NEXT: store ptr [[TMP0]], ptr [[TMP1]], align 8 +// CK1-64-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK1-64-NEXT: store ptr [[TMP0]], ptr [[TMP2]], align 8 +// CK1-64-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CK1-64-NEXT: store ptr null, ptr [[TMP3]], align 8 +// CK1-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK1-64-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK1-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK1-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK1-64-NEXT: store i32 2, ptr [[TMP6]], align 4 +// CK1-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK1-64-NEXT: store i32 1, ptr [[TMP7]], align 4 +// CK1-64-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK1-64-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 8 +// CK1-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK1-64-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 8 +// CK1-64-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK1-64-NEXT: store ptr @.offload_sizes, ptr [[TMP10]], align 8 +// CK1-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK1-64-NEXT: store ptr @.offload_maptypes, ptr [[TMP11]], align 8 +// CK1-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK1-64-NEXT: store ptr null, ptr [[TMP12]], align 8 +// CK1-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK1-64-NEXT: store ptr null, ptr [[TMP13]], align 8 +// CK1-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK1-64-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK1-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK1-64-NEXT: store i64 0, ptr [[TMP15]], align 8 +// CK1-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK1-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP16]], align 4 +// CK1-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK1-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4 +// CK1-64-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK1-64-NEXT: store i32 0, ptr [[TMP18]], align 4 +// CK1-64-NEXT: [[TMP19:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr 
@.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l37.region_id, ptr [[KERNEL_ARGS]]) +// CK1-64-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CK1-64-NEXT: br i1 [[TMP20]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CK1-64: omp_offload.failed: +// CK1-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l37(ptr [[TMP0]]) #[[ATTR2:[0-9]+]] +// CK1-64-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CK1-64: omp_offload.cont: +// CK1-64-NEXT: [[TMP21:%.*]] = load ptr, ptr [[L]], align 8 +// CK1-64-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 +// CK1-64-NEXT: store ptr [[TMP21]], ptr [[TMP22]], align 8 +// CK1-64-NEXT: [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0 +// CK1-64-NEXT: store ptr [[TMP21]], ptr [[TMP23]], align 8 +// CK1-64-NEXT: [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS3]], i64 0, i64 0 +// CK1-64-NEXT: store ptr null, ptr [[TMP24]], align 8 +// CK1-64-NEXT: [[TMP25:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 +// CK1-64-NEXT: [[TMP26:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0 +// CK1-64-NEXT: [[KERNEL_ARGS4:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK1-64-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0 +// CK1-64-NEXT: store i32 2, ptr [[TMP27]], align 4 +// CK1-64-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1 +// CK1-64-NEXT: store i32 1, ptr [[TMP28]], align 4 +// CK1-64-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2 +// CK1-64-NEXT: store ptr [[TMP25]], ptr [[TMP29]], align 8 +// CK1-64-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 3 +// CK1-64-NEXT: store ptr [[TMP26]], ptr [[TMP30]], align 8 +// CK1-64-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 4 +// CK1-64-NEXT: store ptr @.offload_sizes.1, ptr [[TMP31]], align 8 +// CK1-64-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 5 +// CK1-64-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP32]], align 8 +// CK1-64-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 6 +// CK1-64-NEXT: store ptr null, ptr [[TMP33]], align 8 +// CK1-64-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 7 +// CK1-64-NEXT: store ptr null, ptr [[TMP34]], align 8 +// CK1-64-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 8 +// CK1-64-NEXT: store i64 0, ptr [[TMP35]], align 8 +// CK1-64-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 9 +// CK1-64-NEXT: store i64 0, ptr [[TMP36]], align 8 +// CK1-64-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 10 +// CK1-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP37]], align 4 +// CK1-64-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 11 +// CK1-64-NEXT: 
store [3 x i32] zeroinitializer, ptr [[TMP38]], align 4 +// CK1-64-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 12 +// CK1-64-NEXT: store i32 0, ptr [[TMP39]], align 4 +// CK1-64-NEXT: [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l43.region_id, ptr [[KERNEL_ARGS4]]) +// CK1-64-NEXT: [[TMP41:%.*]] = icmp ne i32 [[TMP40]], 0 +// CK1-64-NEXT: br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED5:%.*]], label [[OMP_OFFLOAD_CONT6:%.*]] +// CK1-64: omp_offload.failed5: +// CK1-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l43(ptr [[TMP21]]) #[[ATTR2]] +// CK1-64-NEXT: br label [[OMP_OFFLOAD_CONT6]] +// CK1-64: omp_offload.cont6: +// CK1-64-NEXT: [[TMP42:%.*]] = load ptr, ptr [[T]], align 8 +// CK1-64-NEXT: [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 0 +// CK1-64-NEXT: store ptr [[TMP42]], ptr [[TMP43]], align 8 +// CK1-64-NEXT: [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS8]], i32 0, i32 0 +// CK1-64-NEXT: store ptr [[TMP42]], ptr [[TMP44]], align 8 +// CK1-64-NEXT: [[TMP45:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS9]], i64 0, i64 0 +// CK1-64-NEXT: store ptr null, ptr [[TMP45]], align 8 +// CK1-64-NEXT: [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 0 +// CK1-64-NEXT: [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS8]], i32 0, i32 0 +// CK1-64-NEXT: [[KERNEL_ARGS10:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK1-64-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 0 +// CK1-64-NEXT: store i32 2, ptr [[TMP48]], align 4 +// CK1-64-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 1 +// CK1-64-NEXT: store i32 1, ptr [[TMP49]], align 4 +// CK1-64-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 2 +// CK1-64-NEXT: store ptr [[TMP46]], ptr [[TMP50]], align 8 +// CK1-64-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 3 +// CK1-64-NEXT: store ptr [[TMP47]], ptr [[TMP51]], align 8 +// CK1-64-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 4 +// CK1-64-NEXT: store ptr @.offload_sizes.3, ptr [[TMP52]], align 8 +// CK1-64-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 5 +// CK1-64-NEXT: store ptr @.offload_maptypes.4, ptr [[TMP53]], align 8 +// CK1-64-NEXT: [[TMP54:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 6 +// CK1-64-NEXT: store ptr null, ptr [[TMP54]], align 8 +// CK1-64-NEXT: [[TMP55:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 7 +// CK1-64-NEXT: store ptr null, ptr [[TMP55]], align 8 +// CK1-64-NEXT: [[TMP56:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 8 +// CK1-64-NEXT: store i64 0, ptr [[TMP56]], align 8 +// CK1-64-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 9 +// CK1-64-NEXT: store i64 0, ptr [[TMP57]], align 
8 +// CK1-64-NEXT: [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 10 +// CK1-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP58]], align 4 +// CK1-64-NEXT: [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 11 +// CK1-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP59]], align 4 +// CK1-64-NEXT: [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 12 +// CK1-64-NEXT: store i32 0, ptr [[TMP60]], align 4 +// CK1-64-NEXT: [[TMP61:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l49.region_id, ptr [[KERNEL_ARGS10]]) +// CK1-64-NEXT: [[TMP62:%.*]] = icmp ne i32 [[TMP61]], 0 +// CK1-64-NEXT: br i1 [[TMP62]], label [[OMP_OFFLOAD_FAILED11:%.*]], label [[OMP_OFFLOAD_CONT12:%.*]] +// CK1-64: omp_offload.failed11: +// CK1-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l49(ptr [[TMP42]]) #[[ATTR2]] +// CK1-64-NEXT: br label [[OMP_OFFLOAD_CONT12]] +// CK1-64: omp_offload.cont12: +// CK1-64-NEXT: [[TMP63:%.*]] = load ptr, ptr [[LR_ADDR]], align 8 +// CK1-64-NEXT: store ptr [[TMP63]], ptr [[TMP]], align 8 +// CK1-64-NEXT: [[TMP64:%.*]] = load ptr, ptr [[TMP]], align 8 +// CK1-64-NEXT: [[TMP65:%.*]] = load ptr, ptr [[TMP64]], align 8 +// CK1-64-NEXT: [[TMP66:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS13]], i32 0, i32 0 +// CK1-64-NEXT: store ptr [[TMP65]], ptr [[TMP66]], align 8 +// CK1-64-NEXT: [[TMP67:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS14]], i32 0, i32 0 +// CK1-64-NEXT: store ptr [[TMP65]], ptr [[TMP67]], align 8 +// CK1-64-NEXT: [[TMP68:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS15]], i64 0, i64 0 +// CK1-64-NEXT: store ptr null, ptr [[TMP68]], align 8 +// CK1-64-NEXT: [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS13]], i32 0, i32 0 +// CK1-64-NEXT: [[TMP70:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS14]], i32 0, i32 0 +// CK1-64-NEXT: [[KERNEL_ARGS16:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK1-64-NEXT: [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 0 +// CK1-64-NEXT: store i32 2, ptr [[TMP71]], align 4 +// CK1-64-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 1 +// CK1-64-NEXT: store i32 1, ptr [[TMP72]], align 4 +// CK1-64-NEXT: [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 2 +// CK1-64-NEXT: store ptr [[TMP69]], ptr [[TMP73]], align 8 +// CK1-64-NEXT: [[TMP74:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 3 +// CK1-64-NEXT: store ptr [[TMP70]], ptr [[TMP74]], align 8 +// CK1-64-NEXT: [[TMP75:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 4 +// CK1-64-NEXT: store ptr @.offload_sizes.5, ptr [[TMP75]], align 8 +// CK1-64-NEXT: [[TMP76:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 5 +// CK1-64-NEXT: store ptr @.offload_maptypes.6, ptr [[TMP76]], align 8 +// CK1-64-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 6 +// CK1-64-NEXT: store ptr null, 
ptr [[TMP77]], align 8 +// CK1-64-NEXT: [[TMP78:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 7 +// CK1-64-NEXT: store ptr null, ptr [[TMP78]], align 8 +// CK1-64-NEXT: [[TMP79:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 8 +// CK1-64-NEXT: store i64 0, ptr [[TMP79]], align 8 +// CK1-64-NEXT: [[TMP80:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 9 +// CK1-64-NEXT: store i64 0, ptr [[TMP80]], align 8 +// CK1-64-NEXT: [[TMP81:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 10 +// CK1-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP81]], align 4 +// CK1-64-NEXT: [[TMP82:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 11 +// CK1-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP82]], align 4 +// CK1-64-NEXT: [[TMP83:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 12 +// CK1-64-NEXT: store i32 0, ptr [[TMP83]], align 4 +// CK1-64-NEXT: [[TMP84:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l55.region_id, ptr [[KERNEL_ARGS16]]) +// CK1-64-NEXT: [[TMP85:%.*]] = icmp ne i32 [[TMP84]], 0 +// CK1-64-NEXT: br i1 [[TMP85]], label [[OMP_OFFLOAD_FAILED17:%.*]], label [[OMP_OFFLOAD_CONT18:%.*]] +// CK1-64: omp_offload.failed17: +// CK1-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l55(ptr [[TMP65]]) #[[ATTR2]] +// CK1-64-NEXT: br label [[OMP_OFFLOAD_CONT18]] +// CK1-64: omp_offload.cont18: +// CK1-64-NEXT: [[TMP86:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 +// CK1-64-NEXT: store ptr [[TMP86]], ptr [[_TMP19]], align 8 +// CK1-64-NEXT: [[TMP87:%.*]] = load ptr, ptr [[_TMP19]], align 8 +// CK1-64-NEXT: [[TMP88:%.*]] = load ptr, ptr [[TMP87]], align 8 +// CK1-64-NEXT: [[TMP89:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS20]], i32 0, i32 0 +// CK1-64-NEXT: store ptr [[TMP88]], ptr [[TMP89]], align 8 +// CK1-64-NEXT: [[TMP90:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS21]], i32 0, i32 0 +// CK1-64-NEXT: store ptr [[TMP88]], ptr [[TMP90]], align 8 +// CK1-64-NEXT: [[TMP91:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS22]], i64 0, i64 0 +// CK1-64-NEXT: store ptr null, ptr [[TMP91]], align 8 +// CK1-64-NEXT: [[TMP92:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS20]], i32 0, i32 0 +// CK1-64-NEXT: [[TMP93:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS21]], i32 0, i32 0 +// CK1-64-NEXT: [[KERNEL_ARGS23:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK1-64-NEXT: [[TMP94:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0 +// CK1-64-NEXT: store i32 2, ptr [[TMP94]], align 4 +// CK1-64-NEXT: [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1 +// CK1-64-NEXT: store i32 1, ptr [[TMP95]], align 4 +// CK1-64-NEXT: [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2 +// CK1-64-NEXT: store ptr [[TMP92]], ptr [[TMP96]], align 8 +// CK1-64-NEXT: [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 3 +// CK1-64-NEXT: store ptr [[TMP93]], ptr 
[[TMP97]], align 8 +// CK1-64-NEXT: [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 4 +// CK1-64-NEXT: store ptr @.offload_sizes.7, ptr [[TMP98]], align 8 +// CK1-64-NEXT: [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 5 +// CK1-64-NEXT: store ptr @.offload_maptypes.8, ptr [[TMP99]], align 8 +// CK1-64-NEXT: [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 6 +// CK1-64-NEXT: store ptr null, ptr [[TMP100]], align 8 +// CK1-64-NEXT: [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 7 +// CK1-64-NEXT: store ptr null, ptr [[TMP101]], align 8 +// CK1-64-NEXT: [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 8 +// CK1-64-NEXT: store i64 0, ptr [[TMP102]], align 8 +// CK1-64-NEXT: [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 9 +// CK1-64-NEXT: store i64 0, ptr [[TMP103]], align 8 +// CK1-64-NEXT: [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 10 +// CK1-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP104]], align 4 +// CK1-64-NEXT: [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 11 +// CK1-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP105]], align 4 +// CK1-64-NEXT: [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 12 +// CK1-64-NEXT: store i32 0, ptr [[TMP106]], align 4 +// CK1-64-NEXT: [[TMP107:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l61.region_id, ptr [[KERNEL_ARGS23]]) +// CK1-64-NEXT: [[TMP108:%.*]] = icmp ne i32 [[TMP107]], 0 +// CK1-64-NEXT: br i1 [[TMP108]], label [[OMP_OFFLOAD_FAILED24:%.*]], label [[OMP_OFFLOAD_CONT25:%.*]] +// CK1-64: omp_offload.failed24: +// CK1-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l61(ptr [[TMP88]]) #[[ATTR2]] +// CK1-64-NEXT: br label [[OMP_OFFLOAD_CONT25]] +// CK1-64: omp_offload.cont25: +// CK1-64-NEXT: [[TMP109:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 +// CK1-64-NEXT: store ptr [[TMP109]], ptr [[_TMP26]], align 8 +// CK1-64-NEXT: [[TMP110:%.*]] = load ptr, ptr [[_TMP26]], align 8 +// CK1-64-NEXT: [[TMP111:%.*]] = load ptr, ptr [[TMP110]], align 8 +// CK1-64-NEXT: [[TMP112:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0 +// CK1-64-NEXT: store ptr [[TMP111]], ptr [[TMP112]], align 8 +// CK1-64-NEXT: [[TMP113:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0 +// CK1-64-NEXT: store ptr [[TMP111]], ptr [[TMP113]], align 8 +// CK1-64-NEXT: [[TMP114:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS29]], i64 0, i64 0 +// CK1-64-NEXT: store ptr null, ptr [[TMP114]], align 8 +// CK1-64-NEXT: [[TMP115:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0 +// CK1-64-NEXT: [[TMP116:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0 +// CK1-64-NEXT: [[KERNEL_ARGS30:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK1-64-NEXT: [[TMP117:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], 
i32 0, i32 0 +// CK1-64-NEXT: store i32 2, ptr [[TMP117]], align 4 +// CK1-64-NEXT: [[TMP118:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1 +// CK1-64-NEXT: store i32 1, ptr [[TMP118]], align 4 +// CK1-64-NEXT: [[TMP119:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2 +// CK1-64-NEXT: store ptr [[TMP115]], ptr [[TMP119]], align 8 +// CK1-64-NEXT: [[TMP120:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 3 +// CK1-64-NEXT: store ptr [[TMP116]], ptr [[TMP120]], align 8 +// CK1-64-NEXT: [[TMP121:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 4 +// CK1-64-NEXT: store ptr @.offload_sizes.9, ptr [[TMP121]], align 8 +// CK1-64-NEXT: [[TMP122:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 5 +// CK1-64-NEXT: store ptr @.offload_maptypes.10, ptr [[TMP122]], align 8 +// CK1-64-NEXT: [[TMP123:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 6 +// CK1-64-NEXT: store ptr null, ptr [[TMP123]], align 8 +// CK1-64-NEXT: [[TMP124:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 7 +// CK1-64-NEXT: store ptr null, ptr [[TMP124]], align 8 +// CK1-64-NEXT: [[TMP125:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 8 +// CK1-64-NEXT: store i64 0, ptr [[TMP125]], align 8 +// CK1-64-NEXT: [[TMP126:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 9 +// CK1-64-NEXT: store i64 0, ptr [[TMP126]], align 8 +// CK1-64-NEXT: [[TMP127:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 10 +// CK1-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP127]], align 4 +// CK1-64-NEXT: [[TMP128:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 11 +// CK1-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP128]], align 4 +// CK1-64-NEXT: [[TMP129:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 12 +// CK1-64-NEXT: store i32 0, ptr [[TMP129]], align 4 +// CK1-64-NEXT: [[TMP130:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l67.region_id, ptr [[KERNEL_ARGS30]]) +// CK1-64-NEXT: [[TMP131:%.*]] = icmp ne i32 [[TMP130]], 0 +// CK1-64-NEXT: br i1 [[TMP131]], label [[OMP_OFFLOAD_FAILED31:%.*]], label [[OMP_OFFLOAD_CONT32:%.*]] +// CK1-64: omp_offload.failed31: +// CK1-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l67(ptr [[TMP111]]) #[[ATTR2]] +// CK1-64-NEXT: br label [[OMP_OFFLOAD_CONT32]] +// CK1-64: omp_offload.cont32: +// CK1-64-NEXT: [[TMP132:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 +// CK1-64-NEXT: store ptr [[TMP132]], ptr [[_TMP33]], align 8 +// CK1-64-NEXT: [[TMP133:%.*]] = load ptr, ptr [[LR_ADDR]], align 8 +// CK1-64-NEXT: store ptr [[TMP133]], ptr [[_TMP34]], align 8 +// CK1-64-NEXT: [[TMP134:%.*]] = load ptr, ptr [[_TMP33]], align 8 +// CK1-64-NEXT: [[TMP135:%.*]] = load ptr, ptr [[TMP134]], align 8 +// CK1-64-NEXT: [[TMP136:%.*]] = load ptr, ptr [[_TMP34]], align 8 +// CK1-64-NEXT: [[TMP137:%.*]] = load ptr, ptr [[TMP136]], align 8 +// CK1-64-NEXT: [[TMP138:%.*]] = getelementptr 
inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 0 +// CK1-64-NEXT: store ptr [[TMP135]], ptr [[TMP138]], align 8 +// CK1-64-NEXT: [[TMP139:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 0 +// CK1-64-NEXT: store ptr [[TMP135]], ptr [[TMP139]], align 8 +// CK1-64-NEXT: [[TMP140:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS37]], i64 0, i64 0 +// CK1-64-NEXT: store ptr null, ptr [[TMP140]], align 8 +// CK1-64-NEXT: [[TMP141:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 1 +// CK1-64-NEXT: store ptr [[TMP137]], ptr [[TMP141]], align 8 +// CK1-64-NEXT: [[TMP142:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 1 +// CK1-64-NEXT: store ptr [[TMP137]], ptr [[TMP142]], align 8 +// CK1-64-NEXT: [[TMP143:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS37]], i64 0, i64 1 +// CK1-64-NEXT: store ptr null, ptr [[TMP143]], align 8 +// CK1-64-NEXT: [[TMP144:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 0 +// CK1-64-NEXT: [[TMP145:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 0 +// CK1-64-NEXT: [[KERNEL_ARGS38:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK1-64-NEXT: [[TMP146:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 0 +// CK1-64-NEXT: store i32 2, ptr [[TMP146]], align 4 +// CK1-64-NEXT: [[TMP147:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 1 +// CK1-64-NEXT: store i32 2, ptr [[TMP147]], align 4 +// CK1-64-NEXT: [[TMP148:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 2 +// CK1-64-NEXT: store ptr [[TMP144]], ptr [[TMP148]], align 8 +// CK1-64-NEXT: [[TMP149:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 3 +// CK1-64-NEXT: store ptr [[TMP145]], ptr [[TMP149]], align 8 +// CK1-64-NEXT: [[TMP150:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 4 +// CK1-64-NEXT: store ptr @.offload_sizes.11, ptr [[TMP150]], align 8 +// CK1-64-NEXT: [[TMP151:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 5 +// CK1-64-NEXT: store ptr @.offload_maptypes.12, ptr [[TMP151]], align 8 +// CK1-64-NEXT: [[TMP152:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 6 +// CK1-64-NEXT: store ptr null, ptr [[TMP152]], align 8 +// CK1-64-NEXT: [[TMP153:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 7 +// CK1-64-NEXT: store ptr null, ptr [[TMP153]], align 8 +// CK1-64-NEXT: [[TMP154:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 8 +// CK1-64-NEXT: store i64 0, ptr [[TMP154]], align 8 +// CK1-64-NEXT: [[TMP155:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 9 +// CK1-64-NEXT: store i64 0, ptr [[TMP155]], align 8 +// CK1-64-NEXT: [[TMP156:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 10 +// CK1-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP156]], align 4 +// CK1-64-NEXT: [[TMP157:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 11 +// CK1-64-NEXT: store [3 x i32] 
zeroinitializer, ptr [[TMP157]], align 4 +// CK1-64-NEXT: [[TMP158:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 12 +// CK1-64-NEXT: store i32 0, ptr [[TMP158]], align 4 +// CK1-64-NEXT: [[TMP159:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l74.region_id, ptr [[KERNEL_ARGS38]]) +// CK1-64-NEXT: [[TMP160:%.*]] = icmp ne i32 [[TMP159]], 0 +// CK1-64-NEXT: br i1 [[TMP160]], label [[OMP_OFFLOAD_FAILED39:%.*]], label [[OMP_OFFLOAD_CONT40:%.*]] +// CK1-64: omp_offload.failed39: +// CK1-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l74(ptr [[TMP135]], ptr [[TMP137]]) #[[ATTR2]] +// CK1-64-NEXT: br label [[OMP_OFFLOAD_CONT40]] +// CK1-64: omp_offload.cont40: +// CK1-64-NEXT: ret void +// CK1-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l37 +// CK1-64-SAME: (ptr noundef [[G:%.*]]) #[[ATTR1:[0-9]+]] { +// CK1-64-NEXT: entry: +// CK1-64-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8 +// CK1-64-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8 +// CK1-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[G_ADDR]], align 8 +// CK1-64-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK1-64-NEXT: store ptr [[INCDEC_PTR]], ptr [[G_ADDR]], align 8 +// CK1-64-NEXT: ret void +// CK1-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l43 +// CK1-64-SAME: (ptr noundef [[L:%.*]]) #[[ATTR1]] { +// CK1-64-NEXT: entry: +// CK1-64-NEXT: [[L_ADDR:%.*]] = alloca ptr, align 8 +// CK1-64-NEXT: store ptr [[L]], ptr [[L_ADDR]], align 8 +// CK1-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[L_ADDR]], align 8 +// CK1-64-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 1 +// CK1-64-NEXT: store ptr [[INCDEC_PTR]], ptr [[L_ADDR]], align 8 +// CK1-64-NEXT: ret void +// CK1-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l49 +// CK1-64-SAME: (ptr noundef [[T:%.*]]) #[[ATTR1]] { +// CK1-64-NEXT: entry: +// CK1-64-NEXT: [[T_ADDR:%.*]] = alloca ptr, align 8 +// CK1-64-NEXT: store ptr [[T]], ptr [[T_ADDR]], align 8 +// CK1-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_ADDR]], align 8 +// CK1-64-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 1 +// CK1-64-NEXT: store ptr [[INCDEC_PTR]], ptr [[T_ADDR]], align 8 +// CK1-64-NEXT: ret void +// CK1-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l55 +// CK1-64-SAME: (ptr noundef [[LR:%.*]]) #[[ATTR1]] { +// CK1-64-NEXT: entry: +// CK1-64-NEXT: [[LR_ADDR:%.*]] = alloca ptr, align 8 +// CK1-64-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// CK1-64-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 8 +// CK1-64-NEXT: store ptr [[LR_ADDR]], ptr [[TMP]], align 8 +// CK1-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 +// CK1-64-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 +// CK1-64-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// CK1-64-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 +// CK1-64-NEXT: ret void +// CK1-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l61 +// CK1-64-SAME: (ptr noundef [[TR:%.*]]) #[[ATTR1]] { +// CK1-64-NEXT: entry: +// CK1-64-NEXT: [[TR_ADDR:%.*]] = alloca ptr, align 8 +// CK1-64-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// CK1-64-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 8 +// CK1-64-NEXT: 
store ptr [[TR_ADDR]], ptr [[TMP]], align 8 +// CK1-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 +// CK1-64-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 +// CK1-64-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK1-64-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 +// CK1-64-NEXT: ret void +// CK1-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l67 +// CK1-64-SAME: (ptr noundef [[TR:%.*]]) #[[ATTR1]] { +// CK1-64-NEXT: entry: +// CK1-64-NEXT: [[TR_ADDR:%.*]] = alloca ptr, align 8 +// CK1-64-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// CK1-64-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 8 +// CK1-64-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 8 +// CK1-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 +// CK1-64-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 +// CK1-64-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK1-64-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 +// CK1-64-NEXT: ret void +// CK1-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l74 +// CK1-64-SAME: (ptr noundef [[TR:%.*]], ptr noundef [[LR:%.*]]) #[[ATTR1]] { +// CK1-64-NEXT: entry: +// CK1-64-NEXT: [[TR_ADDR:%.*]] = alloca ptr, align 8 +// CK1-64-NEXT: [[LR_ADDR:%.*]] = alloca ptr, align 8 +// CK1-64-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// CK1-64-NEXT: [[_TMP1:%.*]] = alloca ptr, align 8 +// CK1-64-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 8 +// CK1-64-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 8 +// CK1-64-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 8 +// CK1-64-NEXT: store ptr [[LR_ADDR]], ptr [[_TMP1]], align 8 +// CK1-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 +// CK1-64-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 +// CK1-64-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK1-64-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 +// CK1-64-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP1]], align 8 +// CK1-64-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8 +// CK1-64-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 1 +// CK1-64-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP2]], align 8 +// CK1-64-NEXT: ret void +// CK1-64-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// CK1-64-SAME: () #[[ATTR3:[0-9]+]] { +// CK1-64-NEXT: entry: +// CK1-64-NEXT: call void @__tgt_register_requires(i64 1) +// CK1-64-NEXT: ret void +// CK2-64-LABEL: define {{[^@]+}}@_Z3barPd +// CK2-64-SAME: (ptr noundef [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +// CK2-64-NEXT: entry: +// CK2-64-NEXT: [[ARG_ADDR:%.*]] = alloca ptr, align 8 +// CK2-64-NEXT: [[A:%.*]] = alloca [[STRUCT_ST:%.*]], align 8 +// CK2-64-NEXT: store ptr [[ARG]], ptr [[ARG_ADDR]], align 8 +// CK2-64-NEXT: call void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) +// CK2-64-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) +// CK2-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 8 +// CK2-64-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK2-64-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 8 +// CK2-64-NEXT: ret void +// CK2-64-LABEL: define {{[^@]+}}@_ZN2STIdEC1ERPd +// CK2-64-SAME: (ptr noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr noundef nonnull 
align 8 dereferenceable(8) [[B:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 { +// CK2-64-NEXT: entry: +// CK2-64-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CK2-64-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// CK2-64-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CK2-64-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// CK2-64-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CK2-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// CK2-64-NEXT: call void @_ZN2STIdEC2ERPd(ptr noundef nonnull align 8 dereferenceable(16) [[THIS1]], ptr noundef nonnull align 8 dereferenceable(8) [[TMP0]]) +// CK2-64-NEXT: ret void +// CK2-64-LABEL: define {{[^@]+}}@_ZN2STIdE3fooERPd +// CK2-64-SAME: (ptr noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG:%.*]]) #[[ATTR0]] comdat align 2 { +// CK2-64-NEXT: entry: +// CK2-64-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CK2-64-NEXT: [[ARG_ADDR:%.*]] = alloca ptr, align 8 +// CK2-64-NEXT: [[LA:%.*]] = alloca ptr, align 8 +// CK2-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +// CK2-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 +// CK2-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 +// CK2-64-NEXT: [[DOTOFFLOAD_BASEPTRS2:%.*]] = alloca [2 x ptr], align 8 +// CK2-64-NEXT: [[DOTOFFLOAD_PTRS3:%.*]] = alloca [2 x ptr], align 8 +// CK2-64-NEXT: [[DOTOFFLOAD_MAPPERS4:%.*]] = alloca [2 x ptr], align 8 +// CK2-64-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [2 x i64], align 8 +// CK2-64-NEXT: [[DOTOFFLOAD_BASEPTRS10:%.*]] = alloca [3 x ptr], align 8 +// CK2-64-NEXT: [[DOTOFFLOAD_PTRS11:%.*]] = alloca [3 x ptr], align 8 +// CK2-64-NEXT: [[DOTOFFLOAD_MAPPERS12:%.*]] = alloca [3 x ptr], align 8 +// CK2-64-NEXT: [[DOTOFFLOAD_SIZES13:%.*]] = alloca [3 x i64], align 8 +// CK2-64-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CK2-64-NEXT: store ptr [[ARG]], ptr [[ARG_ADDR]], align 8 +// CK2-64-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CK2-64-NEXT: store ptr null, ptr [[LA]], align 8 +// CK2-64-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 +// CK2-64-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK2-64-NEXT: store ptr [[THIS1]], ptr [[TMP0]], align 8 +// CK2-64-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK2-64-NEXT: store ptr [[A]], ptr [[TMP1]], align 8 +// CK2-64-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CK2-64-NEXT: store ptr null, ptr [[TMP2]], align 8 +// CK2-64-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK2-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK2-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK2-64-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK2-64-NEXT: store i32 2, ptr [[TMP5]], align 4 +// CK2-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK2-64-NEXT: store i32 1, ptr [[TMP6]], align 4 +// CK2-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK2-64-NEXT: store ptr [[TMP3]], ptr [[TMP7]], align 8 +// 
CK2-64-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK2-64-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 8 +// CK2-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK2-64-NEXT: store ptr @.offload_sizes, ptr [[TMP9]], align 8 +// CK2-64-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK2-64-NEXT: store ptr @.offload_maptypes, ptr [[TMP10]], align 8 +// CK2-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK2-64-NEXT: store ptr null, ptr [[TMP11]], align 8 +// CK2-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK2-64-NEXT: store ptr null, ptr [[TMP12]], align 8 +// CK2-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK2-64-NEXT: store i64 0, ptr [[TMP13]], align 8 +// CK2-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK2-64-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK2-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK2-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP15]], align 4 +// CK2-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK2-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4 +// CK2-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK2-64-NEXT: store i32 0, ptr [[TMP17]], align 4 +// CK2-64-NEXT: [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l112.region_id, ptr [[KERNEL_ARGS]]) +// CK2-64-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 +// CK2-64-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CK2-64: omp_offload.failed: +// CK2-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l112(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CK2-64-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CK2-64: omp_offload.cont: +// CK2-64-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 +// CK2-64-NEXT: [[TMP20:%.*]] = load ptr, ptr [[B]], align 8 +// CK2-64-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[B]], i32 1 +// CK2-64-NEXT: [[TMP22:%.*]] = ptrtoint ptr [[TMP21]] to i64 +// CK2-64-NEXT: [[TMP23:%.*]] = ptrtoint ptr [[B]] to i64 +// CK2-64-NEXT: [[TMP24:%.*]] = sub i64 [[TMP22]], [[TMP23]] +// CK2-64-NEXT: [[TMP25:%.*]] = sdiv exact i64 [[TMP24]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +// CK2-64-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DOTOFFLOAD_SIZES]], ptr align 8 @.offload_sizes.1, i64 16, i1 false) +// CK2-64-NEXT: [[TMP26:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0 +// CK2-64-NEXT: store ptr [[THIS1]], ptr [[TMP26]], align 8 +// CK2-64-NEXT: [[TMP27:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0 +// CK2-64-NEXT: store ptr [[B]], ptr [[TMP27]], align 8 +// CK2-64-NEXT: [[TMP28:%.*]] = getelementptr inbounds [2 x i64], ptr [[DOTOFFLOAD_SIZES]], 
i32 0, i32 0 +// CK2-64-NEXT: store i64 [[TMP25]], ptr [[TMP28]], align 8 +// CK2-64-NEXT: [[TMP29:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS4]], i64 0, i64 0 +// CK2-64-NEXT: store ptr null, ptr [[TMP29]], align 8 +// CK2-64-NEXT: [[TMP30:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 1 +// CK2-64-NEXT: store ptr [[THIS1]], ptr [[TMP30]], align 8 +// CK2-64-NEXT: [[TMP31:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 1 +// CK2-64-NEXT: store ptr [[TMP20]], ptr [[TMP31]], align 8 +// CK2-64-NEXT: [[TMP32:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS4]], i64 0, i64 1 +// CK2-64-NEXT: store ptr null, ptr [[TMP32]], align 8 +// CK2-64-NEXT: [[TMP33:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0 +// CK2-64-NEXT: [[TMP34:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0 +// CK2-64-NEXT: [[TMP35:%.*]] = getelementptr inbounds [2 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +// CK2-64-NEXT: [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK2-64-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0 +// CK2-64-NEXT: store i32 2, ptr [[TMP36]], align 4 +// CK2-64-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1 +// CK2-64-NEXT: store i32 2, ptr [[TMP37]], align 4 +// CK2-64-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2 +// CK2-64-NEXT: store ptr [[TMP33]], ptr [[TMP38]], align 8 +// CK2-64-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 3 +// CK2-64-NEXT: store ptr [[TMP34]], ptr [[TMP39]], align 8 +// CK2-64-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 4 +// CK2-64-NEXT: store ptr [[TMP35]], ptr [[TMP40]], align 8 +// CK2-64-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 5 +// CK2-64-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP41]], align 8 +// CK2-64-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 6 +// CK2-64-NEXT: store ptr null, ptr [[TMP42]], align 8 +// CK2-64-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 7 +// CK2-64-NEXT: store ptr null, ptr [[TMP43]], align 8 +// CK2-64-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 8 +// CK2-64-NEXT: store i64 0, ptr [[TMP44]], align 8 +// CK2-64-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 9 +// CK2-64-NEXT: store i64 0, ptr [[TMP45]], align 8 +// CK2-64-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 10 +// CK2-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP46]], align 4 +// CK2-64-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 11 +// CK2-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP47]], align 4 +// CK2-64-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 12 +// CK2-64-NEXT: store i32 
0, ptr [[TMP48]], align 4 +// CK2-64-NEXT: [[TMP49:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l118.region_id, ptr [[KERNEL_ARGS5]]) +// CK2-64-NEXT: [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0 +// CK2-64-NEXT: br i1 [[TMP50]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] +// CK2-64: omp_offload.failed6: +// CK2-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l118(ptr [[THIS1]]) #[[ATTR3]] +// CK2-64-NEXT: br label [[OMP_OFFLOAD_CONT7]] +// CK2-64: omp_offload.cont7: +// CK2-64-NEXT: [[A8:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 0 +// CK2-64-NEXT: [[B9:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 +// CK2-64-NEXT: [[TMP51:%.*]] = load ptr, ptr [[B9]], align 8 +// CK2-64-NEXT: [[TMP52:%.*]] = getelementptr ptr, ptr [[B9]], i32 1 +// CK2-64-NEXT: [[TMP53:%.*]] = ptrtoint ptr [[TMP52]] to i64 +// CK2-64-NEXT: [[TMP54:%.*]] = ptrtoint ptr [[A8]] to i64 +// CK2-64-NEXT: [[TMP55:%.*]] = sub i64 [[TMP53]], [[TMP54]] +// CK2-64-NEXT: [[TMP56:%.*]] = sdiv exact i64 [[TMP55]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +// CK2-64-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DOTOFFLOAD_SIZES13]], ptr align 8 @.offload_sizes.3, i64 24, i1 false) +// CK2-64-NEXT: [[TMP57:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0 +// CK2-64-NEXT: store ptr [[THIS1]], ptr [[TMP57]], align 8 +// CK2-64-NEXT: [[TMP58:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0 +// CK2-64-NEXT: store ptr [[A8]], ptr [[TMP58]], align 8 +// CK2-64-NEXT: [[TMP59:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES13]], i32 0, i32 0 +// CK2-64-NEXT: store i64 [[TMP56]], ptr [[TMP59]], align 8 +// CK2-64-NEXT: [[TMP60:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS12]], i64 0, i64 0 +// CK2-64-NEXT: store ptr null, ptr [[TMP60]], align 8 +// CK2-64-NEXT: [[TMP61:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 1 +// CK2-64-NEXT: store ptr [[THIS1]], ptr [[TMP61]], align 8 +// CK2-64-NEXT: [[TMP62:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 1 +// CK2-64-NEXT: store ptr [[A8]], ptr [[TMP62]], align 8 +// CK2-64-NEXT: [[TMP63:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS12]], i64 0, i64 1 +// CK2-64-NEXT: store ptr null, ptr [[TMP63]], align 8 +// CK2-64-NEXT: [[TMP64:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 2 +// CK2-64-NEXT: store ptr [[THIS1]], ptr [[TMP64]], align 8 +// CK2-64-NEXT: [[TMP65:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 2 +// CK2-64-NEXT: store ptr [[TMP51]], ptr [[TMP65]], align 8 +// CK2-64-NEXT: [[TMP66:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS12]], i64 0, i64 2 +// CK2-64-NEXT: store ptr null, ptr [[TMP66]], align 8 +// CK2-64-NEXT: [[TMP67:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0 +// CK2-64-NEXT: [[TMP68:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0 +// CK2-64-NEXT: [[TMP69:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES13]], i32 0, i32 0 +// CK2-64-NEXT: [[KERNEL_ARGS14:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK2-64-NEXT: [[TMP70:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0 +// CK2-64-NEXT: store i32 2, ptr [[TMP70]], align 4 +// CK2-64-NEXT: [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1 +// CK2-64-NEXT: store i32 3, ptr [[TMP71]], align 4 +// CK2-64-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2 +// CK2-64-NEXT: store ptr [[TMP67]], ptr [[TMP72]], align 8 +// CK2-64-NEXT: [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 3 +// CK2-64-NEXT: store ptr [[TMP68]], ptr [[TMP73]], align 8 +// CK2-64-NEXT: [[TMP74:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 4 +// CK2-64-NEXT: store ptr [[TMP69]], ptr [[TMP74]], align 8 +// CK2-64-NEXT: [[TMP75:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 5 +// CK2-64-NEXT: store ptr @.offload_maptypes.4, ptr [[TMP75]], align 8 +// CK2-64-NEXT: [[TMP76:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 6 +// CK2-64-NEXT: store ptr null, ptr [[TMP76]], align 8 +// CK2-64-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 7 +// CK2-64-NEXT: store ptr null, ptr [[TMP77]], align 8 +// CK2-64-NEXT: [[TMP78:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 8 +// CK2-64-NEXT: store i64 0, ptr [[TMP78]], align 8 +// CK2-64-NEXT: [[TMP79:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 9 +// CK2-64-NEXT: store i64 0, ptr [[TMP79]], align 8 +// CK2-64-NEXT: [[TMP80:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 10 +// CK2-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP80]], align 4 +// CK2-64-NEXT: [[TMP81:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 11 +// CK2-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP81]], align 4 +// CK2-64-NEXT: [[TMP82:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 12 +// CK2-64-NEXT: store i32 0, ptr [[TMP82]], align 4 +// CK2-64-NEXT: [[TMP83:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l125.region_id, ptr [[KERNEL_ARGS14]]) +// CK2-64-NEXT: [[TMP84:%.*]] = icmp ne i32 [[TMP83]], 0 +// CK2-64-NEXT: br i1 [[TMP84]], label [[OMP_OFFLOAD_FAILED15:%.*]], label [[OMP_OFFLOAD_CONT16:%.*]] +// CK2-64: omp_offload.failed15: +// CK2-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l125(ptr [[THIS1]]) #[[ATTR3]] +// CK2-64-NEXT: br label [[OMP_OFFLOAD_CONT16]] +// CK2-64: omp_offload.cont16: +// CK2-64-NEXT: ret void +// CK2-64-LABEL: define {{[^@]+}}@_ZN2STIdEC2ERPd +// CK2-64-SAME: (ptr noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 { +// CK2-64-NEXT: entry: +// CK2-64-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CK2-64-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// CK2-64-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CK2-64-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// CK2-64-NEXT: [[THIS1:%.*]] = load ptr, ptr 
[[THIS_ADDR]], align 8 +// CK2-64-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 +// CK2-64-NEXT: store ptr null, ptr [[A]], align 8 +// CK2-64-NEXT: [[B2:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 +// CK2-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// CK2-64-NEXT: store ptr [[TMP0]], ptr [[B2]], align 8 +// CK2-64-NEXT: ret void +// CK2-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l112 +// CK2-64-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CK2-64-NEXT: entry: +// CK2-64-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CK2-64-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CK2-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CK2-64-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CK2-64-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 8 +// CK2-64-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK2-64-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 8 +// CK2-64-NEXT: ret void +// CK2-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l118 +// CK2-64-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CK2-64-NEXT: entry: +// CK2-64-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CK2-64-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CK2-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CK2-64-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 1 +// CK2-64-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 8 +// CK2-64-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 +// CK2-64-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// CK2-64-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP1]], align 8 +// CK2-64-NEXT: ret void +// CK2-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l125 +// CK2-64-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CK2-64-NEXT: entry: +// CK2-64-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CK2-64-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CK2-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CK2-64-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CK2-64-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 8 +// CK2-64-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK2-64-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 8 +// CK2-64-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[TMP0]], i32 0, i32 1 +// CK2-64-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B]], align 8 +// CK2-64-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8 +// CK2-64-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// CK2-64-NEXT: store ptr [[INCDEC_PTR1]], ptr [[TMP2]], align 8 +// CK2-64-NEXT: ret void +// CK2-64-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// CK2-64-SAME: () #[[ATTR5:[0-9]+]] { +// CK2-64-NEXT: entry: +// CK2-64-NEXT: call void @__tgt_register_requires(i64 1) +// CK2-64-NEXT: ret void +// CK2-32-LABEL: define {{[^@]+}}@_Z3barPd +// CK2-32-SAME: (ptr noundef [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +// CK2-32-NEXT: entry: +// CK2-32-NEXT: [[ARG_ADDR:%.*]] = alloca ptr, align 4 +// CK2-32-NEXT: [[A:%.*]] = alloca [[STRUCT_ST:%.*]], align 4 +// CK2-32-NEXT: store ptr [[ARG]], ptr [[ARG_ADDR]], align 4 +// CK2-32-NEXT: call void 
@_ZN2STIdEC1ERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) +// CK2-32-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) +// CK2-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 4 +// CK2-32-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK2-32-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 4 +// CK2-32-NEXT: ret void +// CK2-32-LABEL: define {{[^@]+}}@_ZN2STIdEC1ERPd +// CK2-32-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 { +// CK2-32-NEXT: entry: +// CK2-32-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// CK2-32-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4 +// CK2-32-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// CK2-32-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 4 +// CK2-32-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// CK2-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4 +// CK2-32-NEXT: call void @_ZN2STIdEC2ERPd(ptr noundef nonnull align 4 dereferenceable(8) [[THIS1]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP0]]) +// CK2-32-NEXT: ret void +// CK2-32-LABEL: define {{[^@]+}}@_ZN2STIdE3fooERPd +// CK2-32-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG:%.*]]) #[[ATTR0]] comdat align 2 { +// CK2-32-NEXT: entry: +// CK2-32-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// CK2-32-NEXT: [[ARG_ADDR:%.*]] = alloca ptr, align 4 +// CK2-32-NEXT: [[LA:%.*]] = alloca ptr, align 4 +// CK2-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4 +// CK2-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4 +// CK2-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4 +// CK2-32-NEXT: [[DOTOFFLOAD_BASEPTRS2:%.*]] = alloca [2 x ptr], align 4 +// CK2-32-NEXT: [[DOTOFFLOAD_PTRS3:%.*]] = alloca [2 x ptr], align 4 +// CK2-32-NEXT: [[DOTOFFLOAD_MAPPERS4:%.*]] = alloca [2 x ptr], align 4 +// CK2-32-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [2 x i64], align 4 +// CK2-32-NEXT: [[DOTOFFLOAD_BASEPTRS10:%.*]] = alloca [3 x ptr], align 4 +// CK2-32-NEXT: [[DOTOFFLOAD_PTRS11:%.*]] = alloca [3 x ptr], align 4 +// CK2-32-NEXT: [[DOTOFFLOAD_MAPPERS12:%.*]] = alloca [3 x ptr], align 4 +// CK2-32-NEXT: [[DOTOFFLOAD_SIZES13:%.*]] = alloca [3 x i64], align 4 +// CK2-32-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// CK2-32-NEXT: store ptr [[ARG]], ptr [[ARG_ADDR]], align 4 +// CK2-32-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// CK2-32-NEXT: store ptr null, ptr [[LA]], align 4 +// CK2-32-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 +// CK2-32-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK2-32-NEXT: store ptr [[THIS1]], ptr [[TMP0]], align 4 +// CK2-32-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK2-32-NEXT: store ptr [[A]], ptr [[TMP1]], align 4 +// CK2-32-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CK2-32-NEXT: store ptr null, ptr [[TMP2]], align 4 +// CK2-32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK2-32-NEXT: [[TMP4:%.*]] = getelementptr 
inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK2-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK2-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK2-32-NEXT: store i32 2, ptr [[TMP5]], align 4 +// CK2-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK2-32-NEXT: store i32 1, ptr [[TMP6]], align 4 +// CK2-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK2-32-NEXT: store ptr [[TMP3]], ptr [[TMP7]], align 4 +// CK2-32-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK2-32-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 4 +// CK2-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK2-32-NEXT: store ptr @.offload_sizes, ptr [[TMP9]], align 4 +// CK2-32-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK2-32-NEXT: store ptr @.offload_maptypes, ptr [[TMP10]], align 4 +// CK2-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK2-32-NEXT: store ptr null, ptr [[TMP11]], align 4 +// CK2-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK2-32-NEXT: store ptr null, ptr [[TMP12]], align 4 +// CK2-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK2-32-NEXT: store i64 0, ptr [[TMP13]], align 8 +// CK2-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK2-32-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK2-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK2-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP15]], align 4 +// CK2-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK2-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4 +// CK2-32-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK2-32-NEXT: store i32 0, ptr [[TMP17]], align 4 +// CK2-32-NEXT: [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l112.region_id, ptr [[KERNEL_ARGS]]) +// CK2-32-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 +// CK2-32-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CK2-32: omp_offload.failed: +// CK2-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l112(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CK2-32-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CK2-32: omp_offload.cont: +// CK2-32-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 +// CK2-32-NEXT: [[TMP20:%.*]] = load ptr, ptr [[B]], align 4 +// CK2-32-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[B]], i32 1 +// CK2-32-NEXT: [[TMP22:%.*]] = ptrtoint ptr [[TMP21]] to i64 +// CK2-32-NEXT: [[TMP23:%.*]] = ptrtoint ptr [[B]] to i64 +// CK2-32-NEXT: [[TMP24:%.*]] = sub i64 
[[TMP22]], [[TMP23]] +// CK2-32-NEXT: [[TMP25:%.*]] = sdiv exact i64 [[TMP24]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +// CK2-32-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[DOTOFFLOAD_SIZES]], ptr align 4 @.offload_sizes.1, i32 16, i1 false) +// CK2-32-NEXT: [[TMP26:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0 +// CK2-32-NEXT: store ptr [[THIS1]], ptr [[TMP26]], align 4 +// CK2-32-NEXT: [[TMP27:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0 +// CK2-32-NEXT: store ptr [[B]], ptr [[TMP27]], align 4 +// CK2-32-NEXT: [[TMP28:%.*]] = getelementptr inbounds [2 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +// CK2-32-NEXT: store i64 [[TMP25]], ptr [[TMP28]], align 4 +// CK2-32-NEXT: [[TMP29:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 0 +// CK2-32-NEXT: store ptr null, ptr [[TMP29]], align 4 +// CK2-32-NEXT: [[TMP30:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 1 +// CK2-32-NEXT: store ptr [[THIS1]], ptr [[TMP30]], align 4 +// CK2-32-NEXT: [[TMP31:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 1 +// CK2-32-NEXT: store ptr [[TMP20]], ptr [[TMP31]], align 4 +// CK2-32-NEXT: [[TMP32:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 1 +// CK2-32-NEXT: store ptr null, ptr [[TMP32]], align 4 +// CK2-32-NEXT: [[TMP33:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0 +// CK2-32-NEXT: [[TMP34:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0 +// CK2-32-NEXT: [[TMP35:%.*]] = getelementptr inbounds [2 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +// CK2-32-NEXT: [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK2-32-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0 +// CK2-32-NEXT: store i32 2, ptr [[TMP36]], align 4 +// CK2-32-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1 +// CK2-32-NEXT: store i32 2, ptr [[TMP37]], align 4 +// CK2-32-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2 +// CK2-32-NEXT: store ptr [[TMP33]], ptr [[TMP38]], align 4 +// CK2-32-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 3 +// CK2-32-NEXT: store ptr [[TMP34]], ptr [[TMP39]], align 4 +// CK2-32-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 4 +// CK2-32-NEXT: store ptr [[TMP35]], ptr [[TMP40]], align 4 +// CK2-32-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 5 +// CK2-32-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP41]], align 4 +// CK2-32-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 6 +// CK2-32-NEXT: store ptr null, ptr [[TMP42]], align 4 +// CK2-32-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 7 +// CK2-32-NEXT: store ptr null, ptr [[TMP43]], align 4 +// CK2-32-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 8 +// CK2-32-NEXT: store i64 0, ptr [[TMP44]], align 8 +// CK2-32-NEXT: [[TMP45:%.*]] = 
getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 9 +// CK2-32-NEXT: store i64 0, ptr [[TMP45]], align 8 +// CK2-32-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 10 +// CK2-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP46]], align 4 +// CK2-32-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 11 +// CK2-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP47]], align 4 +// CK2-32-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 12 +// CK2-32-NEXT: store i32 0, ptr [[TMP48]], align 4 +// CK2-32-NEXT: [[TMP49:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l118.region_id, ptr [[KERNEL_ARGS5]]) +// CK2-32-NEXT: [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0 +// CK2-32-NEXT: br i1 [[TMP50]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] +// CK2-32: omp_offload.failed6: +// CK2-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l118(ptr [[THIS1]]) #[[ATTR3]] +// CK2-32-NEXT: br label [[OMP_OFFLOAD_CONT7]] +// CK2-32: omp_offload.cont7: +// CK2-32-NEXT: [[A8:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 0 +// CK2-32-NEXT: [[B9:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 +// CK2-32-NEXT: [[TMP51:%.*]] = load ptr, ptr [[B9]], align 4 +// CK2-32-NEXT: [[TMP52:%.*]] = getelementptr ptr, ptr [[B9]], i32 1 +// CK2-32-NEXT: [[TMP53:%.*]] = ptrtoint ptr [[TMP52]] to i64 +// CK2-32-NEXT: [[TMP54:%.*]] = ptrtoint ptr [[A8]] to i64 +// CK2-32-NEXT: [[TMP55:%.*]] = sub i64 [[TMP53]], [[TMP54]] +// CK2-32-NEXT: [[TMP56:%.*]] = sdiv exact i64 [[TMP55]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +// CK2-32-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[DOTOFFLOAD_SIZES13]], ptr align 4 @.offload_sizes.3, i32 24, i1 false) +// CK2-32-NEXT: [[TMP57:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0 +// CK2-32-NEXT: store ptr [[THIS1]], ptr [[TMP57]], align 4 +// CK2-32-NEXT: [[TMP58:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0 +// CK2-32-NEXT: store ptr [[A8]], ptr [[TMP58]], align 4 +// CK2-32-NEXT: [[TMP59:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES13]], i32 0, i32 0 +// CK2-32-NEXT: store i64 [[TMP56]], ptr [[TMP59]], align 4 +// CK2-32-NEXT: [[TMP60:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS12]], i32 0, i32 0 +// CK2-32-NEXT: store ptr null, ptr [[TMP60]], align 4 +// CK2-32-NEXT: [[TMP61:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 1 +// CK2-32-NEXT: store ptr [[THIS1]], ptr [[TMP61]], align 4 +// CK2-32-NEXT: [[TMP62:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 1 +// CK2-32-NEXT: store ptr [[A8]], ptr [[TMP62]], align 4 +// CK2-32-NEXT: [[TMP63:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS12]], i32 0, i32 1 +// CK2-32-NEXT: store ptr null, ptr [[TMP63]], align 4 +// CK2-32-NEXT: [[TMP64:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 2 +// CK2-32-NEXT: store ptr [[THIS1]], ptr [[TMP64]], align 4 +// CK2-32-NEXT: [[TMP65:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 2 +// 
CK2-32-NEXT: store ptr [[TMP51]], ptr [[TMP65]], align 4 +// CK2-32-NEXT: [[TMP66:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS12]], i32 0, i32 2 +// CK2-32-NEXT: store ptr null, ptr [[TMP66]], align 4 +// CK2-32-NEXT: [[TMP67:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0 +// CK2-32-NEXT: [[TMP68:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0 +// CK2-32-NEXT: [[TMP69:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES13]], i32 0, i32 0 +// CK2-32-NEXT: [[KERNEL_ARGS14:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK2-32-NEXT: [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0 +// CK2-32-NEXT: store i32 2, ptr [[TMP70]], align 4 +// CK2-32-NEXT: [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1 +// CK2-32-NEXT: store i32 3, ptr [[TMP71]], align 4 +// CK2-32-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2 +// CK2-32-NEXT: store ptr [[TMP67]], ptr [[TMP72]], align 4 +// CK2-32-NEXT: [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 3 +// CK2-32-NEXT: store ptr [[TMP68]], ptr [[TMP73]], align 4 +// CK2-32-NEXT: [[TMP74:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 4 +// CK2-32-NEXT: store ptr [[TMP69]], ptr [[TMP74]], align 4 +// CK2-32-NEXT: [[TMP75:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 5 +// CK2-32-NEXT: store ptr @.offload_maptypes.4, ptr [[TMP75]], align 4 +// CK2-32-NEXT: [[TMP76:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 6 +// CK2-32-NEXT: store ptr null, ptr [[TMP76]], align 4 +// CK2-32-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 7 +// CK2-32-NEXT: store ptr null, ptr [[TMP77]], align 4 +// CK2-32-NEXT: [[TMP78:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 8 +// CK2-32-NEXT: store i64 0, ptr [[TMP78]], align 8 +// CK2-32-NEXT: [[TMP79:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 9 +// CK2-32-NEXT: store i64 0, ptr [[TMP79]], align 8 +// CK2-32-NEXT: [[TMP80:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 10 +// CK2-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP80]], align 4 +// CK2-32-NEXT: [[TMP81:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 11 +// CK2-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP81]], align 4 +// CK2-32-NEXT: [[TMP82:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 12 +// CK2-32-NEXT: store i32 0, ptr [[TMP82]], align 4 +// CK2-32-NEXT: [[TMP83:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l125.region_id, ptr [[KERNEL_ARGS14]]) +// CK2-32-NEXT: [[TMP84:%.*]] = icmp ne i32 [[TMP83]], 0 +// CK2-32-NEXT: br i1 [[TMP84]], label [[OMP_OFFLOAD_FAILED15:%.*]], label [[OMP_OFFLOAD_CONT16:%.*]] +// CK2-32: omp_offload.failed15: +// CK2-32-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l125(ptr [[THIS1]]) #[[ATTR3]] +// CK2-32-NEXT: br label [[OMP_OFFLOAD_CONT16]] +// CK2-32: omp_offload.cont16: +// CK2-32-NEXT: ret void +// CK2-32-LABEL: define {{[^@]+}}@_ZN2STIdEC2ERPd +// CK2-32-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 { +// CK2-32-NEXT: entry: +// CK2-32-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// CK2-32-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4 +// CK2-32-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// CK2-32-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 4 +// CK2-32-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// CK2-32-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 +// CK2-32-NEXT: store ptr null, ptr [[A]], align 4 +// CK2-32-NEXT: [[B2:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 +// CK2-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4 +// CK2-32-NEXT: store ptr [[TMP0]], ptr [[B2]], align 4 +// CK2-32-NEXT: ret void +// CK2-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l112 +// CK2-32-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CK2-32-NEXT: entry: +// CK2-32-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// CK2-32-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// CK2-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// CK2-32-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CK2-32-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 4 +// CK2-32-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK2-32-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 4 +// CK2-32-NEXT: ret void +// CK2-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l118 +// CK2-32-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CK2-32-NEXT: entry: +// CK2-32-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// CK2-32-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// CK2-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// CK2-32-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 1 +// CK2-32-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 4 +// CK2-32-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 4 +// CK2-32-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// CK2-32-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP1]], align 4 +// CK2-32-NEXT: ret void +// CK2-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l125 +// CK2-32-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CK2-32-NEXT: entry: +// CK2-32-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// CK2-32-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// CK2-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// CK2-32-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CK2-32-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 4 +// CK2-32-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK2-32-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 4 +// CK2-32-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[TMP0]], i32 0, i32 1 +// CK2-32-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B]], align 4 +// CK2-32-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 4 +// CK2-32-NEXT: 
[[INCDEC_PTR1:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// CK2-32-NEXT: store ptr [[INCDEC_PTR1]], ptr [[TMP2]], align 4 +// CK2-32-NEXT: ret void +// CK2-32-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// CK2-32-SAME: () #[[ATTR5:[0-9]+]] { +// CK2-32-NEXT: entry: +// CK2-32-NEXT: call void @__tgt_register_requires(i64 1) +// CK2-32-NEXT: ret void +// CK3-64-LABEL: define {{[^@]+}}@_Z3barv +// CK3-64-SAME: () #[[ATTR0:[0-9]+]] { +// CK3-64-NEXT: entry: +// CK3-64-NEXT: [[PTR:%.*]] = alloca ptr, align 64 +// CK3-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +// CK3-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 +// CK3-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 +// CK3-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR]], align 64 +// CK3-64-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK3-64-NEXT: store ptr [[TMP0]], ptr [[TMP1]], align 8 +// CK3-64-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK3-64-NEXT: store ptr [[TMP0]], ptr [[TMP2]], align 8 +// CK3-64-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CK3-64-NEXT: store ptr null, ptr [[TMP3]], align 8 +// CK3-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK3-64-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK3-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK3-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK3-64-NEXT: store i32 2, ptr [[TMP6]], align 4 +// CK3-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK3-64-NEXT: store i32 1, ptr [[TMP7]], align 4 +// CK3-64-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK3-64-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 8 +// CK3-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK3-64-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 8 +// CK3-64-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK3-64-NEXT: store ptr @.offload_sizes, ptr [[TMP10]], align 8 +// CK3-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK3-64-NEXT: store ptr @.offload_maptypes, ptr [[TMP11]], align 8 +// CK3-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK3-64-NEXT: store ptr null, ptr [[TMP12]], align 8 +// CK3-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK3-64-NEXT: store ptr null, ptr [[TMP13]], align 8 +// CK3-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK3-64-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK3-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK3-64-NEXT: store i64 0, ptr [[TMP15]], align 8 +// CK3-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, 
i32 10 +// CK3-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP16]], align 4 +// CK3-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK3-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4 +// CK3-64-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK3-64-NEXT: store i32 0, ptr [[TMP18]], align 4 +// CK3-64-NEXT: [[TMP19:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3barv_l159.region_id, ptr [[KERNEL_ARGS]]) +// CK3-64-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CK3-64-NEXT: br i1 [[TMP20]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CK3-64: omp_offload.failed: +// CK3-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3barv_l159(ptr [[TMP0]]) #[[ATTR2:[0-9]+]] +// CK3-64-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CK3-64: omp_offload.cont: +// CK3-64-NEXT: ret void +// CK3-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3barv_l159 +// CK3-64-SAME: (ptr noundef [[PTR:%.*]]) #[[ATTR1:[0-9]+]] { +// CK3-64-NEXT: entry: +// CK3-64-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 +// CK3-64-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8 +// CK3-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 +// CK3-64-NEXT: store double 0.000000e+00, ptr [[TMP0]], align 8 +// CK3-64-NEXT: ret void +// CK3-64-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// CK3-64-SAME: () #[[ATTR3:[0-9]+]] { +// CK3-64-NEXT: entry: +// CK3-64-NEXT: call void @__tgt_register_requires(i64 1) +// CK3-64-NEXT: ret void +// CK3-32-LABEL: define {{[^@]+}}@_Z3barv +// CK3-32-SAME: () #[[ATTR0:[0-9]+]] { +// CK3-32-NEXT: entry: +// CK3-32-NEXT: [[PTR:%.*]] = alloca ptr, align 64 +// CK3-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4 +// CK3-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4 +// CK3-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4 +// CK3-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR]], align 64 +// CK3-32-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK3-32-NEXT: store ptr [[TMP0]], ptr [[TMP1]], align 4 +// CK3-32-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK3-32-NEXT: store ptr [[TMP0]], ptr [[TMP2]], align 4 +// CK3-32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CK3-32-NEXT: store ptr null, ptr [[TMP3]], align 4 +// CK3-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK3-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK3-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK3-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK3-32-NEXT: store i32 2, ptr [[TMP6]], align 4 +// CK3-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK3-32-NEXT: store i32 1, ptr [[TMP7]], align 4 +// CK3-32-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK3-32-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 4 +// CK3-32-NEXT: [[TMP9:%.*]] = getelementptr 
inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK3-32-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 4 +// CK3-32-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK3-32-NEXT: store ptr @.offload_sizes, ptr [[TMP10]], align 4 +// CK3-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK3-32-NEXT: store ptr @.offload_maptypes, ptr [[TMP11]], align 4 +// CK3-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK3-32-NEXT: store ptr null, ptr [[TMP12]], align 4 +// CK3-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK3-32-NEXT: store ptr null, ptr [[TMP13]], align 4 +// CK3-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK3-32-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK3-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK3-32-NEXT: store i64 0, ptr [[TMP15]], align 8 +// CK3-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK3-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP16]], align 4 +// CK3-32-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK3-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4 +// CK3-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK3-32-NEXT: store i32 0, ptr [[TMP18]], align 4 +// CK3-32-NEXT: [[TMP19:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3barv_l159.region_id, ptr [[KERNEL_ARGS]]) +// CK3-32-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CK3-32-NEXT: br i1 [[TMP20]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CK3-32: omp_offload.failed: +// CK3-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3barv_l159(ptr [[TMP0]]) #[[ATTR2:[0-9]+]] +// CK3-32-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CK3-32: omp_offload.cont: +// CK3-32-NEXT: ret void +// CK3-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3barv_l159 +// CK3-32-SAME: (ptr noundef [[PTR:%.*]]) #[[ATTR1:[0-9]+]] { +// CK3-32-NEXT: entry: +// CK3-32-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4 +// CK3-32-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 4 +// CK3-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4 +// CK3-32-NEXT: store double 0.000000e+00, ptr [[TMP0]], align 4 +// CK3-32-NEXT: ret void +// CK3-32-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// CK3-32-SAME: () #[[ATTR3:[0-9]+]] { +// CK3-32-NEXT: entry: +// CK3-32-NEXT: call void @__tgt_register_requires(i64 1) +// CK3-32-NEXT: ret void +// CK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// CK1-SAME: () #[[ATTR3:[0-9]+]] { +// CK1-NEXT: entry: +// CK1-NEXT: call void @__tgt_register_requires(i64 1) +// CK1-NEXT: ret void +// CK1-32-LABEL: define {{[^@]+}}@_Z3barRPfRPi +// CK1-32-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) #[[ATTR0:[0-9]+]] { +// CK1-32-NEXT: entry: +// CK1-32-NEXT: 
[[A_ADDR:%.*]] = alloca ptr, align 4 +// CK1-32-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4 +// CK1-32-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4 +// CK1-32-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 4 +// CK1-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4 +// CK1-32-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 4 +// CK1-32-NEXT: call void @_Z3fooIiEvRPfRPT_(ptr noundef nonnull align 4 dereferenceable(4) [[TMP0]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP1]]) +// CK1-32-NEXT: ret void +// CK1-32-LABEL: define {{[^@]+}}@_Z3fooIiEvRPfRPT_ +// CK1-32-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[LR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[TR:%.*]]) #[[ATTR0]] comdat { +// CK1-32-NEXT: entry: +// CK1-32-NEXT: [[LR_ADDR:%.*]] = alloca ptr, align 4 +// CK1-32-NEXT: [[TR_ADDR:%.*]] = alloca ptr, align 4 +// CK1-32-NEXT: [[L:%.*]] = alloca ptr, align 4 +// CK1-32-NEXT: [[T:%.*]] = alloca ptr, align 4 +// CK1-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4 +// CK1-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4 +// CK1-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4 +// CK1-32-NEXT: [[DOTOFFLOAD_BASEPTRS1:%.*]] = alloca [1 x ptr], align 4 +// CK1-32-NEXT: [[DOTOFFLOAD_PTRS2:%.*]] = alloca [1 x ptr], align 4 +// CK1-32-NEXT: [[DOTOFFLOAD_MAPPERS3:%.*]] = alloca [1 x ptr], align 4 +// CK1-32-NEXT: [[DOTOFFLOAD_BASEPTRS7:%.*]] = alloca [1 x ptr], align 4 +// CK1-32-NEXT: [[DOTOFFLOAD_PTRS8:%.*]] = alloca [1 x ptr], align 4 +// CK1-32-NEXT: [[DOTOFFLOAD_MAPPERS9:%.*]] = alloca [1 x ptr], align 4 +// CK1-32-NEXT: [[TMP:%.*]] = alloca ptr, align 4 +// CK1-32-NEXT: [[DOTOFFLOAD_BASEPTRS13:%.*]] = alloca [1 x ptr], align 4 +// CK1-32-NEXT: [[DOTOFFLOAD_PTRS14:%.*]] = alloca [1 x ptr], align 4 +// CK1-32-NEXT: [[DOTOFFLOAD_MAPPERS15:%.*]] = alloca [1 x ptr], align 4 +// CK1-32-NEXT: [[_TMP19:%.*]] = alloca ptr, align 4 +// CK1-32-NEXT: [[DOTOFFLOAD_BASEPTRS20:%.*]] = alloca [1 x ptr], align 4 +// CK1-32-NEXT: [[DOTOFFLOAD_PTRS21:%.*]] = alloca [1 x ptr], align 4 +// CK1-32-NEXT: [[DOTOFFLOAD_MAPPERS22:%.*]] = alloca [1 x ptr], align 4 +// CK1-32-NEXT: [[_TMP26:%.*]] = alloca ptr, align 4 +// CK1-32-NEXT: [[DOTOFFLOAD_BASEPTRS27:%.*]] = alloca [1 x ptr], align 4 +// CK1-32-NEXT: [[DOTOFFLOAD_PTRS28:%.*]] = alloca [1 x ptr], align 4 +// CK1-32-NEXT: [[DOTOFFLOAD_MAPPERS29:%.*]] = alloca [1 x ptr], align 4 +// CK1-32-NEXT: [[_TMP33:%.*]] = alloca ptr, align 4 +// CK1-32-NEXT: [[_TMP34:%.*]] = alloca ptr, align 4 +// CK1-32-NEXT: [[DOTOFFLOAD_BASEPTRS35:%.*]] = alloca [2 x ptr], align 4 +// CK1-32-NEXT: [[DOTOFFLOAD_PTRS36:%.*]] = alloca [2 x ptr], align 4 +// CK1-32-NEXT: [[DOTOFFLOAD_MAPPERS37:%.*]] = alloca [2 x ptr], align 4 +// CK1-32-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 4 +// CK1-32-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 4 +// CK1-32-NEXT: [[TMP0:%.*]] = load ptr, ptr @g, align 4 +// CK1-32-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK1-32-NEXT: store ptr [[TMP0]], ptr [[TMP1]], align 4 +// CK1-32-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK1-32-NEXT: store ptr [[TMP0]], ptr [[TMP2]], align 4 +// CK1-32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CK1-32-NEXT: store ptr null, ptr [[TMP3]], align 4 +// CK1-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// 
CK1-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK1-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK1-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK1-32-NEXT: store i32 2, ptr [[TMP6]], align 4 +// CK1-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK1-32-NEXT: store i32 1, ptr [[TMP7]], align 4 +// CK1-32-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK1-32-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 4 +// CK1-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK1-32-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 4 +// CK1-32-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK1-32-NEXT: store ptr @.offload_sizes, ptr [[TMP10]], align 4 +// CK1-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK1-32-NEXT: store ptr @.offload_maptypes, ptr [[TMP11]], align 4 +// CK1-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK1-32-NEXT: store ptr null, ptr [[TMP12]], align 4 +// CK1-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK1-32-NEXT: store ptr null, ptr [[TMP13]], align 4 +// CK1-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK1-32-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK1-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK1-32-NEXT: store i64 0, ptr [[TMP15]], align 8 +// CK1-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK1-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP16]], align 4 +// CK1-32-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK1-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4 +// CK1-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK1-32-NEXT: store i32 0, ptr [[TMP18]], align 4 +// CK1-32-NEXT: [[TMP19:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l37.region_id, ptr [[KERNEL_ARGS]]) +// CK1-32-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CK1-32-NEXT: br i1 [[TMP20]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CK1-32: omp_offload.failed: +// CK1-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l37(ptr [[TMP0]]) #[[ATTR2:[0-9]+]] +// CK1-32-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CK1-32: omp_offload.cont: +// CK1-32-NEXT: [[TMP21:%.*]] = load ptr, ptr [[L]], align 4 +// CK1-32-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 +// CK1-32-NEXT: store ptr [[TMP21]], ptr [[TMP22]], align 4 +// CK1-32-NEXT: [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0 +// CK1-32-NEXT: 
store ptr [[TMP21]], ptr [[TMP23]], align 4 +// CK1-32-NEXT: [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS3]], i32 0, i32 0 +// CK1-32-NEXT: store ptr null, ptr [[TMP24]], align 4 +// CK1-32-NEXT: [[TMP25:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 +// CK1-32-NEXT: [[TMP26:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0 +// CK1-32-NEXT: [[KERNEL_ARGS4:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK1-32-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0 +// CK1-32-NEXT: store i32 2, ptr [[TMP27]], align 4 +// CK1-32-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1 +// CK1-32-NEXT: store i32 1, ptr [[TMP28]], align 4 +// CK1-32-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2 +// CK1-32-NEXT: store ptr [[TMP25]], ptr [[TMP29]], align 4 +// CK1-32-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 3 +// CK1-32-NEXT: store ptr [[TMP26]], ptr [[TMP30]], align 4 +// CK1-32-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 4 +// CK1-32-NEXT: store ptr @.offload_sizes.1, ptr [[TMP31]], align 4 +// CK1-32-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 5 +// CK1-32-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP32]], align 4 +// CK1-32-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 6 +// CK1-32-NEXT: store ptr null, ptr [[TMP33]], align 4 +// CK1-32-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 7 +// CK1-32-NEXT: store ptr null, ptr [[TMP34]], align 4 +// CK1-32-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 8 +// CK1-32-NEXT: store i64 0, ptr [[TMP35]], align 8 +// CK1-32-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 9 +// CK1-32-NEXT: store i64 0, ptr [[TMP36]], align 8 +// CK1-32-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 10 +// CK1-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP37]], align 4 +// CK1-32-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 11 +// CK1-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP38]], align 4 +// CK1-32-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 12 +// CK1-32-NEXT: store i32 0, ptr [[TMP39]], align 4 +// CK1-32-NEXT: [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l43.region_id, ptr [[KERNEL_ARGS4]]) +// CK1-32-NEXT: [[TMP41:%.*]] = icmp ne i32 [[TMP40]], 0 +// CK1-32-NEXT: br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED5:%.*]], label [[OMP_OFFLOAD_CONT6:%.*]] +// CK1-32: omp_offload.failed5: +// CK1-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l43(ptr [[TMP21]]) #[[ATTR2]] +// CK1-32-NEXT: br label [[OMP_OFFLOAD_CONT6]] +// CK1-32: omp_offload.cont6: +// 
CK1-32-NEXT: [[TMP42:%.*]] = load ptr, ptr [[T]], align 4 +// CK1-32-NEXT: [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 0 +// CK1-32-NEXT: store ptr [[TMP42]], ptr [[TMP43]], align 4 +// CK1-32-NEXT: [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS8]], i32 0, i32 0 +// CK1-32-NEXT: store ptr [[TMP42]], ptr [[TMP44]], align 4 +// CK1-32-NEXT: [[TMP45:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS9]], i32 0, i32 0 +// CK1-32-NEXT: store ptr null, ptr [[TMP45]], align 4 +// CK1-32-NEXT: [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 0 +// CK1-32-NEXT: [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS8]], i32 0, i32 0 +// CK1-32-NEXT: [[KERNEL_ARGS10:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK1-32-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 0 +// CK1-32-NEXT: store i32 2, ptr [[TMP48]], align 4 +// CK1-32-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 1 +// CK1-32-NEXT: store i32 1, ptr [[TMP49]], align 4 +// CK1-32-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 2 +// CK1-32-NEXT: store ptr [[TMP46]], ptr [[TMP50]], align 4 +// CK1-32-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 3 +// CK1-32-NEXT: store ptr [[TMP47]], ptr [[TMP51]], align 4 +// CK1-32-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 4 +// CK1-32-NEXT: store ptr @.offload_sizes.3, ptr [[TMP52]], align 4 +// CK1-32-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 5 +// CK1-32-NEXT: store ptr @.offload_maptypes.4, ptr [[TMP53]], align 4 +// CK1-32-NEXT: [[TMP54:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 6 +// CK1-32-NEXT: store ptr null, ptr [[TMP54]], align 4 +// CK1-32-NEXT: [[TMP55:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 7 +// CK1-32-NEXT: store ptr null, ptr [[TMP55]], align 4 +// CK1-32-NEXT: [[TMP56:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 8 +// CK1-32-NEXT: store i64 0, ptr [[TMP56]], align 8 +// CK1-32-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 9 +// CK1-32-NEXT: store i64 0, ptr [[TMP57]], align 8 +// CK1-32-NEXT: [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 10 +// CK1-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP58]], align 4 +// CK1-32-NEXT: [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 11 +// CK1-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP59]], align 4 +// CK1-32-NEXT: [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 12 +// CK1-32-NEXT: store i32 0, ptr [[TMP60]], align 4 +// CK1-32-NEXT: [[TMP61:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l49.region_id, ptr [[KERNEL_ARGS10]]) +// CK1-32-NEXT: [[TMP62:%.*]] = 
icmp ne i32 [[TMP61]], 0 +// CK1-32-NEXT: br i1 [[TMP62]], label [[OMP_OFFLOAD_FAILED11:%.*]], label [[OMP_OFFLOAD_CONT12:%.*]] +// CK1-32: omp_offload.failed11: +// CK1-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l49(ptr [[TMP42]]) #[[ATTR2]] +// CK1-32-NEXT: br label [[OMP_OFFLOAD_CONT12]] +// CK1-32: omp_offload.cont12: +// CK1-32-NEXT: [[TMP63:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 +// CK1-32-NEXT: store ptr [[TMP63]], ptr [[TMP]], align 4 +// CK1-32-NEXT: [[TMP64:%.*]] = load ptr, ptr [[TMP]], align 4 +// CK1-32-NEXT: [[TMP65:%.*]] = load ptr, ptr [[TMP64]], align 4 +// CK1-32-NEXT: [[TMP66:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS13]], i32 0, i32 0 +// CK1-32-NEXT: store ptr [[TMP65]], ptr [[TMP66]], align 4 +// CK1-32-NEXT: [[TMP67:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS14]], i32 0, i32 0 +// CK1-32-NEXT: store ptr [[TMP65]], ptr [[TMP67]], align 4 +// CK1-32-NEXT: [[TMP68:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS15]], i32 0, i32 0 +// CK1-32-NEXT: store ptr null, ptr [[TMP68]], align 4 +// CK1-32-NEXT: [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS13]], i32 0, i32 0 +// CK1-32-NEXT: [[TMP70:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS14]], i32 0, i32 0 +// CK1-32-NEXT: [[KERNEL_ARGS16:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK1-32-NEXT: [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 0 +// CK1-32-NEXT: store i32 2, ptr [[TMP71]], align 4 +// CK1-32-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 1 +// CK1-32-NEXT: store i32 1, ptr [[TMP72]], align 4 +// CK1-32-NEXT: [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 2 +// CK1-32-NEXT: store ptr [[TMP69]], ptr [[TMP73]], align 4 +// CK1-32-NEXT: [[TMP74:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 3 +// CK1-32-NEXT: store ptr [[TMP70]], ptr [[TMP74]], align 4 +// CK1-32-NEXT: [[TMP75:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 4 +// CK1-32-NEXT: store ptr @.offload_sizes.5, ptr [[TMP75]], align 4 +// CK1-32-NEXT: [[TMP76:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 5 +// CK1-32-NEXT: store ptr @.offload_maptypes.6, ptr [[TMP76]], align 4 +// CK1-32-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 6 +// CK1-32-NEXT: store ptr null, ptr [[TMP77]], align 4 +// CK1-32-NEXT: [[TMP78:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 7 +// CK1-32-NEXT: store ptr null, ptr [[TMP78]], align 4 +// CK1-32-NEXT: [[TMP79:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 8 +// CK1-32-NEXT: store i64 0, ptr [[TMP79]], align 8 +// CK1-32-NEXT: [[TMP80:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 9 +// CK1-32-NEXT: store i64 0, ptr [[TMP80]], align 8 +// CK1-32-NEXT: [[TMP81:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 10 +// CK1-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP81]], align 4 +// CK1-32-NEXT: [[TMP82:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 11 +// CK1-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP82]], align 4 +// CK1-32-NEXT: [[TMP83:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 12 +// CK1-32-NEXT: store i32 0, ptr [[TMP83]], align 4 +// CK1-32-NEXT: [[TMP84:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l55.region_id, ptr [[KERNEL_ARGS16]]) +// CK1-32-NEXT: [[TMP85:%.*]] = icmp ne i32 [[TMP84]], 0 +// CK1-32-NEXT: br i1 [[TMP85]], label [[OMP_OFFLOAD_FAILED17:%.*]], label [[OMP_OFFLOAD_CONT18:%.*]] +// CK1-32: omp_offload.failed17: +// CK1-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l55(ptr [[TMP65]]) #[[ATTR2]] +// CK1-32-NEXT: br label [[OMP_OFFLOAD_CONT18]] +// CK1-32: omp_offload.cont18: +// CK1-32-NEXT: [[TMP86:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 +// CK1-32-NEXT: store ptr [[TMP86]], ptr [[_TMP19]], align 4 +// CK1-32-NEXT: [[TMP87:%.*]] = load ptr, ptr [[_TMP19]], align 4 +// CK1-32-NEXT: [[TMP88:%.*]] = load ptr, ptr [[TMP87]], align 4 +// CK1-32-NEXT: [[TMP89:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS20]], i32 0, i32 0 +// CK1-32-NEXT: store ptr [[TMP88]], ptr [[TMP89]], align 4 +// CK1-32-NEXT: [[TMP90:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS21]], i32 0, i32 0 +// CK1-32-NEXT: store ptr [[TMP88]], ptr [[TMP90]], align 4 +// CK1-32-NEXT: [[TMP91:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS22]], i32 0, i32 0 +// CK1-32-NEXT: store ptr null, ptr [[TMP91]], align 4 +// CK1-32-NEXT: [[TMP92:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS20]], i32 0, i32 0 +// CK1-32-NEXT: [[TMP93:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS21]], i32 0, i32 0 +// CK1-32-NEXT: [[KERNEL_ARGS23:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK1-32-NEXT: [[TMP94:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0 +// CK1-32-NEXT: store i32 2, ptr [[TMP94]], align 4 +// CK1-32-NEXT: [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1 +// CK1-32-NEXT: store i32 1, ptr [[TMP95]], align 4 +// CK1-32-NEXT: [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2 +// CK1-32-NEXT: store ptr [[TMP92]], ptr [[TMP96]], align 4 +// CK1-32-NEXT: [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 3 +// CK1-32-NEXT: store ptr [[TMP93]], ptr [[TMP97]], align 4 +// CK1-32-NEXT: [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 4 +// CK1-32-NEXT: store ptr @.offload_sizes.7, ptr [[TMP98]], align 4 +// CK1-32-NEXT: [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 5 +// CK1-32-NEXT: store ptr @.offload_maptypes.8, ptr [[TMP99]], align 4 +// CK1-32-NEXT: [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 6 +// CK1-32-NEXT: store ptr null, ptr [[TMP100]], align 4 +// CK1-32-NEXT: [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 7 +// CK1-32-NEXT: store ptr null, ptr [[TMP101]], align 4 +// CK1-32-NEXT: [[TMP102:%.*]] = getelementptr 
inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 8 +// CK1-32-NEXT: store i64 0, ptr [[TMP102]], align 8 +// CK1-32-NEXT: [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 9 +// CK1-32-NEXT: store i64 0, ptr [[TMP103]], align 8 +// CK1-32-NEXT: [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 10 +// CK1-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP104]], align 4 +// CK1-32-NEXT: [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 11 +// CK1-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP105]], align 4 +// CK1-32-NEXT: [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 12 +// CK1-32-NEXT: store i32 0, ptr [[TMP106]], align 4 +// CK1-32-NEXT: [[TMP107:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l61.region_id, ptr [[KERNEL_ARGS23]]) +// CK1-32-NEXT: [[TMP108:%.*]] = icmp ne i32 [[TMP107]], 0 +// CK1-32-NEXT: br i1 [[TMP108]], label [[OMP_OFFLOAD_FAILED24:%.*]], label [[OMP_OFFLOAD_CONT25:%.*]] +// CK1-32: omp_offload.failed24: +// CK1-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l61(ptr [[TMP88]]) #[[ATTR2]] +// CK1-32-NEXT: br label [[OMP_OFFLOAD_CONT25]] +// CK1-32: omp_offload.cont25: +// CK1-32-NEXT: [[TMP109:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 +// CK1-32-NEXT: store ptr [[TMP109]], ptr [[_TMP26]], align 4 +// CK1-32-NEXT: [[TMP110:%.*]] = load ptr, ptr [[_TMP26]], align 4 +// CK1-32-NEXT: [[TMP111:%.*]] = load ptr, ptr [[TMP110]], align 4 +// CK1-32-NEXT: [[TMP112:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0 +// CK1-32-NEXT: store ptr [[TMP111]], ptr [[TMP112]], align 4 +// CK1-32-NEXT: [[TMP113:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0 +// CK1-32-NEXT: store ptr [[TMP111]], ptr [[TMP113]], align 4 +// CK1-32-NEXT: [[TMP114:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS29]], i32 0, i32 0 +// CK1-32-NEXT: store ptr null, ptr [[TMP114]], align 4 +// CK1-32-NEXT: [[TMP115:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0 +// CK1-32-NEXT: [[TMP116:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0 +// CK1-32-NEXT: [[KERNEL_ARGS30:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK1-32-NEXT: [[TMP117:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0 +// CK1-32-NEXT: store i32 2, ptr [[TMP117]], align 4 +// CK1-32-NEXT: [[TMP118:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1 +// CK1-32-NEXT: store i32 1, ptr [[TMP118]], align 4 +// CK1-32-NEXT: [[TMP119:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2 +// CK1-32-NEXT: store ptr [[TMP115]], ptr [[TMP119]], align 4 +// CK1-32-NEXT: [[TMP120:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 3 +// CK1-32-NEXT: store ptr [[TMP116]], ptr [[TMP120]], align 4 +// CK1-32-NEXT: [[TMP121:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 4 +// CK1-32-NEXT: store ptr @.offload_sizes.9, ptr [[TMP121]], align 4 +// 
CK1-32-NEXT: [[TMP122:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 5 +// CK1-32-NEXT: store ptr @.offload_maptypes.10, ptr [[TMP122]], align 4 +// CK1-32-NEXT: [[TMP123:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 6 +// CK1-32-NEXT: store ptr null, ptr [[TMP123]], align 4 +// CK1-32-NEXT: [[TMP124:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 7 +// CK1-32-NEXT: store ptr null, ptr [[TMP124]], align 4 +// CK1-32-NEXT: [[TMP125:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 8 +// CK1-32-NEXT: store i64 0, ptr [[TMP125]], align 8 +// CK1-32-NEXT: [[TMP126:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 9 +// CK1-32-NEXT: store i64 0, ptr [[TMP126]], align 8 +// CK1-32-NEXT: [[TMP127:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 10 +// CK1-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP127]], align 4 +// CK1-32-NEXT: [[TMP128:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 11 +// CK1-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP128]], align 4 +// CK1-32-NEXT: [[TMP129:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 12 +// CK1-32-NEXT: store i32 0, ptr [[TMP129]], align 4 +// CK1-32-NEXT: [[TMP130:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l67.region_id, ptr [[KERNEL_ARGS30]]) +// CK1-32-NEXT: [[TMP131:%.*]] = icmp ne i32 [[TMP130]], 0 +// CK1-32-NEXT: br i1 [[TMP131]], label [[OMP_OFFLOAD_FAILED31:%.*]], label [[OMP_OFFLOAD_CONT32:%.*]] +// CK1-32: omp_offload.failed31: +// CK1-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l67(ptr [[TMP111]]) #[[ATTR2]] +// CK1-32-NEXT: br label [[OMP_OFFLOAD_CONT32]] +// CK1-32: omp_offload.cont32: +// CK1-32-NEXT: [[TMP132:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 +// CK1-32-NEXT: store ptr [[TMP132]], ptr [[_TMP33]], align 4 +// CK1-32-NEXT: [[TMP133:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 +// CK1-32-NEXT: store ptr [[TMP133]], ptr [[_TMP34]], align 4 +// CK1-32-NEXT: [[TMP134:%.*]] = load ptr, ptr [[_TMP33]], align 4 +// CK1-32-NEXT: [[TMP135:%.*]] = load ptr, ptr [[TMP134]], align 4 +// CK1-32-NEXT: [[TMP136:%.*]] = load ptr, ptr [[_TMP34]], align 4 +// CK1-32-NEXT: [[TMP137:%.*]] = load ptr, ptr [[TMP136]], align 4 +// CK1-32-NEXT: [[TMP138:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 0 +// CK1-32-NEXT: store ptr [[TMP135]], ptr [[TMP138]], align 4 +// CK1-32-NEXT: [[TMP139:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 0 +// CK1-32-NEXT: store ptr [[TMP135]], ptr [[TMP139]], align 4 +// CK1-32-NEXT: [[TMP140:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS37]], i32 0, i32 0 +// CK1-32-NEXT: store ptr null, ptr [[TMP140]], align 4 +// CK1-32-NEXT: [[TMP141:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 1 +// CK1-32-NEXT: store ptr [[TMP137]], ptr [[TMP141]], align 4 +// CK1-32-NEXT: [[TMP142:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 1 +// CK1-32-NEXT: store ptr [[TMP137]], ptr [[TMP142]], align 4 +// CK1-32-NEXT: 
[[TMP143:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS37]], i32 0, i32 1 +// CK1-32-NEXT: store ptr null, ptr [[TMP143]], align 4 +// CK1-32-NEXT: [[TMP144:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 0 +// CK1-32-NEXT: [[TMP145:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 0 +// CK1-32-NEXT: [[KERNEL_ARGS38:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK1-32-NEXT: [[TMP146:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 0 +// CK1-32-NEXT: store i32 2, ptr [[TMP146]], align 4 +// CK1-32-NEXT: [[TMP147:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 1 +// CK1-32-NEXT: store i32 2, ptr [[TMP147]], align 4 +// CK1-32-NEXT: [[TMP148:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 2 +// CK1-32-NEXT: store ptr [[TMP144]], ptr [[TMP148]], align 4 +// CK1-32-NEXT: [[TMP149:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 3 +// CK1-32-NEXT: store ptr [[TMP145]], ptr [[TMP149]], align 4 +// CK1-32-NEXT: [[TMP150:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 4 +// CK1-32-NEXT: store ptr @.offload_sizes.11, ptr [[TMP150]], align 4 +// CK1-32-NEXT: [[TMP151:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 5 +// CK1-32-NEXT: store ptr @.offload_maptypes.12, ptr [[TMP151]], align 4 +// CK1-32-NEXT: [[TMP152:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 6 +// CK1-32-NEXT: store ptr null, ptr [[TMP152]], align 4 +// CK1-32-NEXT: [[TMP153:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 7 +// CK1-32-NEXT: store ptr null, ptr [[TMP153]], align 4 +// CK1-32-NEXT: [[TMP154:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 8 +// CK1-32-NEXT: store i64 0, ptr [[TMP154]], align 8 +// CK1-32-NEXT: [[TMP155:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 9 +// CK1-32-NEXT: store i64 0, ptr [[TMP155]], align 8 +// CK1-32-NEXT: [[TMP156:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 10 +// CK1-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP156]], align 4 +// CK1-32-NEXT: [[TMP157:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 11 +// CK1-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP157]], align 4 +// CK1-32-NEXT: [[TMP158:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 12 +// CK1-32-NEXT: store i32 0, ptr [[TMP158]], align 4 +// CK1-32-NEXT: [[TMP159:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l74.region_id, ptr [[KERNEL_ARGS38]]) +// CK1-32-NEXT: [[TMP160:%.*]] = icmp ne i32 [[TMP159]], 0 +// CK1-32-NEXT: br i1 [[TMP160]], label [[OMP_OFFLOAD_FAILED39:%.*]], label [[OMP_OFFLOAD_CONT40:%.*]] +// CK1-32: omp_offload.failed39: +// CK1-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l74(ptr [[TMP135]], ptr [[TMP137]]) #[[ATTR2]] +// CK1-32-NEXT: br label [[OMP_OFFLOAD_CONT40]] +// CK1-32: 
omp_offload.cont40: +// CK1-32-NEXT: ret void +// CK1-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l37 +// CK1-32-SAME: (ptr noundef [[G:%.*]]) #[[ATTR1:[0-9]+]] { +// CK1-32-NEXT: entry: +// CK1-32-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 4 +// CK1-32-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 4 +// CK1-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[G_ADDR]], align 4 +// CK1-32-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK1-32-NEXT: store ptr [[INCDEC_PTR]], ptr [[G_ADDR]], align 4 +// CK1-32-NEXT: ret void +// CK1-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l43 +// CK1-32-SAME: (ptr noundef [[L:%.*]]) #[[ATTR1]] { +// CK1-32-NEXT: entry: +// CK1-32-NEXT: [[L_ADDR:%.*]] = alloca ptr, align 4 +// CK1-32-NEXT: store ptr [[L]], ptr [[L_ADDR]], align 4 +// CK1-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[L_ADDR]], align 4 +// CK1-32-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 1 +// CK1-32-NEXT: store ptr [[INCDEC_PTR]], ptr [[L_ADDR]], align 4 +// CK1-32-NEXT: ret void +// CK1-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l49 +// CK1-32-SAME: (ptr noundef [[T:%.*]]) #[[ATTR1]] { +// CK1-32-NEXT: entry: +// CK1-32-NEXT: [[T_ADDR:%.*]] = alloca ptr, align 4 +// CK1-32-NEXT: store ptr [[T]], ptr [[T_ADDR]], align 4 +// CK1-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_ADDR]], align 4 +// CK1-32-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 1 +// CK1-32-NEXT: store ptr [[INCDEC_PTR]], ptr [[T_ADDR]], align 4 +// CK1-32-NEXT: ret void +// CK1-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l55 +// CK1-32-SAME: (ptr noundef [[LR:%.*]]) #[[ATTR1]] { +// CK1-32-NEXT: entry: +// CK1-32-NEXT: [[LR_ADDR:%.*]] = alloca ptr, align 4 +// CK1-32-NEXT: [[TMP:%.*]] = alloca ptr, align 4 +// CK1-32-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 4 +// CK1-32-NEXT: store ptr [[LR_ADDR]], ptr [[TMP]], align 4 +// CK1-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 +// CK1-32-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 +// CK1-32-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// CK1-32-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 +// CK1-32-NEXT: ret void +// CK1-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l61 +// CK1-32-SAME: (ptr noundef [[TR:%.*]]) #[[ATTR1]] { +// CK1-32-NEXT: entry: +// CK1-32-NEXT: [[TR_ADDR:%.*]] = alloca ptr, align 4 +// CK1-32-NEXT: [[TMP:%.*]] = alloca ptr, align 4 +// CK1-32-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 4 +// CK1-32-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 4 +// CK1-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 +// CK1-32-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 +// CK1-32-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK1-32-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 +// CK1-32-NEXT: ret void +// CK1-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l67 +// CK1-32-SAME: (ptr noundef [[TR:%.*]]) #[[ATTR1]] { +// CK1-32-NEXT: entry: +// CK1-32-NEXT: [[TR_ADDR:%.*]] = alloca ptr, align 4 +// CK1-32-NEXT: [[TMP:%.*]] = alloca ptr, align 4 +// CK1-32-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 4 +// CK1-32-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 4 +// CK1-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 +// 
CK1-32-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 +// CK1-32-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK1-32-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 +// CK1-32-NEXT: ret void +// CK1-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l74 +// CK1-32-SAME: (ptr noundef [[TR:%.*]], ptr noundef [[LR:%.*]]) #[[ATTR1]] { +// CK1-32-NEXT: entry: +// CK1-32-NEXT: [[TR_ADDR:%.*]] = alloca ptr, align 4 +// CK1-32-NEXT: [[LR_ADDR:%.*]] = alloca ptr, align 4 +// CK1-32-NEXT: [[TMP:%.*]] = alloca ptr, align 4 +// CK1-32-NEXT: [[_TMP1:%.*]] = alloca ptr, align 4 +// CK1-32-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 4 +// CK1-32-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 4 +// CK1-32-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 4 +// CK1-32-NEXT: store ptr [[LR_ADDR]], ptr [[_TMP1]], align 4 +// CK1-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 +// CK1-32-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 +// CK1-32-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK1-32-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 +// CK1-32-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP1]], align 4 +// CK1-32-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 4 +// CK1-32-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 1 +// CK1-32-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP2]], align 4 +// CK1-32-NEXT: ret void +// CK1-32-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// CK1-32-SAME: () #[[ATTR3:[0-9]+]] { +// CK1-32-NEXT: entry: +// CK1-32-NEXT: call void @__tgt_register_requires(i64 1) +// CK1-32-NEXT: ret void +// CK2-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// CK2-SAME: () #[[ATTR5:[0-9]+]] { +// CK2-NEXT: entry: +// CK2-NEXT: call void @__tgt_register_requires(i64 1) +// CK2-NEXT: ret void +// CK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// CK3-SAME: () #[[ATTR3:[0-9]+]] { +// CK3-NEXT: entry: +// CK3-NEXT: call void @__tgt_register_requires(i64 1) +// CK3-NEXT: ret void +// CK10-LABEL: define {{[^@]+}}@_Z3barRPfRPi +// CK10-SAME: (ptr noundef nonnull align 8 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]]) #[[ATTR0:[0-9]+]] { +// CK10-NEXT: entry: +// CK10-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +// CK10-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// CK10-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 +// CK10-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// CK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// CK10-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// CK10-NEXT: call void @_Z3fooIiEvRPfRPT_(ptr noundef nonnull align 8 dereferenceable(8) [[TMP0]], ptr noundef nonnull align 8 dereferenceable(8) [[TMP1]]) +// CK10-NEXT: ret void +// +// +// CK10-LABEL: define {{[^@]+}}@_Z3fooIiEvRPfRPT_ +// CK10-SAME: (ptr noundef nonnull align 8 dereferenceable(8) [[LR:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[TR:%.*]]) #[[ATTR0]] comdat { +// CK10-NEXT: entry: +// CK10-NEXT: [[LR_ADDR:%.*]] = alloca ptr, align 8 +// CK10-NEXT: [[TR_ADDR:%.*]] = alloca ptr, align 8 +// CK10-NEXT: [[L:%.*]] = alloca ptr, align 8 +// CK10-NEXT: [[T:%.*]] = alloca ptr, align 8 +// CK10-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +// CK10-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 +// CK10-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 +// CK10-NEXT: [[KERNEL_ARGS:%.*]] = alloca 
[[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK10-NEXT: [[DOTOFFLOAD_BASEPTRS1:%.*]] = alloca [1 x ptr], align 8 +// CK10-NEXT: [[DOTOFFLOAD_PTRS2:%.*]] = alloca [1 x ptr], align 8 +// CK10-NEXT: [[DOTOFFLOAD_MAPPERS3:%.*]] = alloca [1 x ptr], align 8 +// CK10-NEXT: [[KERNEL_ARGS4:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK10-NEXT: [[DOTOFFLOAD_BASEPTRS7:%.*]] = alloca [1 x ptr], align 8 +// CK10-NEXT: [[DOTOFFLOAD_PTRS8:%.*]] = alloca [1 x ptr], align 8 +// CK10-NEXT: [[DOTOFFLOAD_MAPPERS9:%.*]] = alloca [1 x ptr], align 8 +// CK10-NEXT: [[KERNEL_ARGS10:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK10-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// CK10-NEXT: [[DOTOFFLOAD_BASEPTRS13:%.*]] = alloca [1 x ptr], align 8 +// CK10-NEXT: [[DOTOFFLOAD_PTRS14:%.*]] = alloca [1 x ptr], align 8 +// CK10-NEXT: [[DOTOFFLOAD_MAPPERS15:%.*]] = alloca [1 x ptr], align 8 +// CK10-NEXT: [[KERNEL_ARGS16:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK10-NEXT: [[_TMP19:%.*]] = alloca ptr, align 8 +// CK10-NEXT: [[DOTOFFLOAD_BASEPTRS20:%.*]] = alloca [1 x ptr], align 8 +// CK10-NEXT: [[DOTOFFLOAD_PTRS21:%.*]] = alloca [1 x ptr], align 8 +// CK10-NEXT: [[DOTOFFLOAD_MAPPERS22:%.*]] = alloca [1 x ptr], align 8 +// CK10-NEXT: [[KERNEL_ARGS23:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK10-NEXT: [[_TMP26:%.*]] = alloca ptr, align 8 +// CK10-NEXT: [[DOTOFFLOAD_BASEPTRS27:%.*]] = alloca [1 x ptr], align 8 +// CK10-NEXT: [[DOTOFFLOAD_PTRS28:%.*]] = alloca [1 x ptr], align 8 +// CK10-NEXT: [[DOTOFFLOAD_MAPPERS29:%.*]] = alloca [1 x ptr], align 8 +// CK10-NEXT: [[KERNEL_ARGS30:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK10-NEXT: [[_TMP33:%.*]] = alloca ptr, align 8 +// CK10-NEXT: [[_TMP34:%.*]] = alloca ptr, align 8 +// CK10-NEXT: [[DOTOFFLOAD_BASEPTRS35:%.*]] = alloca [2 x ptr], align 8 +// CK10-NEXT: [[DOTOFFLOAD_PTRS36:%.*]] = alloca [2 x ptr], align 8 +// CK10-NEXT: [[DOTOFFLOAD_MAPPERS37:%.*]] = alloca [2 x ptr], align 8 +// CK10-NEXT: [[KERNEL_ARGS38:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK10-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 8 +// CK10-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 8 +// CK10-NEXT: [[TMP0:%.*]] = load ptr, ptr @g, align 8 +// CK10-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK10-NEXT: store ptr [[TMP0]], ptr [[TMP1]], align 8 +// CK10-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK10-NEXT: store ptr [[TMP0]], ptr [[TMP2]], align 8 +// CK10-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CK10-NEXT: store ptr null, ptr [[TMP3]], align 8 +// CK10-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK10-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK10-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK10-NEXT: store i32 2, ptr [[TMP6]], align 4 +// CK10-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK10-NEXT: store i32 1, ptr [[TMP7]], align 4 +// CK10-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK10-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 8 +// CK10-NEXT: [[TMP9:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK10-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 8 +// CK10-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK10-NEXT: store ptr @.offload_sizes, ptr [[TMP10]], align 8 +// CK10-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK10-NEXT: store ptr @.offload_maptypes, ptr [[TMP11]], align 8 +// CK10-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK10-NEXT: store ptr null, ptr [[TMP12]], align 8 +// CK10-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK10-NEXT: store ptr null, ptr [[TMP13]], align 8 +// CK10-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK10-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK10-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK10-NEXT: store i64 0, ptr [[TMP15]], align 8 +// CK10-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK10-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP16]], align 4 +// CK10-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK10-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4 +// CK10-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK10-NEXT: store i32 0, ptr [[TMP18]], align 4 +// CK10-NEXT: [[TMP19:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l37.region_id, ptr [[KERNEL_ARGS]]) +// CK10-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CK10-NEXT: br i1 [[TMP20]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CK10: omp_offload.failed: +// CK10-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l37(ptr [[TMP0]]) #[[ATTR2:[0-9]+]] +// CK10-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CK10: omp_offload.cont: +// CK10-NEXT: [[TMP21:%.*]] = load ptr, ptr [[L]], align 8 +// CK10-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 +// CK10-NEXT: store ptr [[TMP21]], ptr [[TMP22]], align 8 +// CK10-NEXT: [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0 +// CK10-NEXT: store ptr [[TMP21]], ptr [[TMP23]], align 8 +// CK10-NEXT: [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS3]], i64 0, i64 0 +// CK10-NEXT: store ptr null, ptr [[TMP24]], align 8 +// CK10-NEXT: [[TMP25:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 +// CK10-NEXT: [[TMP26:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0 +// CK10-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0 +// CK10-NEXT: store i32 2, ptr [[TMP27]], align 4 +// CK10-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1 +// CK10-NEXT: store i32 1, ptr [[TMP28]], align 4 +// CK10-NEXT: [[TMP29:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2 +// CK10-NEXT: store ptr [[TMP25]], ptr [[TMP29]], align 8 +// CK10-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 3 +// CK10-NEXT: store ptr [[TMP26]], ptr [[TMP30]], align 8 +// CK10-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 4 +// CK10-NEXT: store ptr @.offload_sizes.1, ptr [[TMP31]], align 8 +// CK10-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 5 +// CK10-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP32]], align 8 +// CK10-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 6 +// CK10-NEXT: store ptr null, ptr [[TMP33]], align 8 +// CK10-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 7 +// CK10-NEXT: store ptr null, ptr [[TMP34]], align 8 +// CK10-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 8 +// CK10-NEXT: store i64 0, ptr [[TMP35]], align 8 +// CK10-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 9 +// CK10-NEXT: store i64 0, ptr [[TMP36]], align 8 +// CK10-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 10 +// CK10-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP37]], align 4 +// CK10-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 11 +// CK10-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP38]], align 4 +// CK10-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 12 +// CK10-NEXT: store i32 0, ptr [[TMP39]], align 4 +// CK10-NEXT: [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l43.region_id, ptr [[KERNEL_ARGS4]]) +// CK10-NEXT: [[TMP41:%.*]] = icmp ne i32 [[TMP40]], 0 +// CK10-NEXT: br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED5:%.*]], label [[OMP_OFFLOAD_CONT6:%.*]] +// CK10: omp_offload.failed5: +// CK10-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l43(ptr [[TMP21]]) #[[ATTR2]] +// CK10-NEXT: br label [[OMP_OFFLOAD_CONT6]] +// CK10: omp_offload.cont6: +// CK10-NEXT: [[TMP42:%.*]] = load ptr, ptr [[T]], align 8 +// CK10-NEXT: [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 0 +// CK10-NEXT: store ptr [[TMP42]], ptr [[TMP43]], align 8 +// CK10-NEXT: [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS8]], i32 0, i32 0 +// CK10-NEXT: store ptr [[TMP42]], ptr [[TMP44]], align 8 +// CK10-NEXT: [[TMP45:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS9]], i64 0, i64 0 +// CK10-NEXT: store ptr null, ptr [[TMP45]], align 8 +// CK10-NEXT: [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 0 +// CK10-NEXT: [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS8]], i32 0, i32 0 +// CK10-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 0 +// CK10-NEXT: store i32 2, ptr [[TMP48]], align 4 +// CK10-NEXT: [[TMP49:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 1 +// CK10-NEXT: store i32 1, ptr [[TMP49]], align 4 +// CK10-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 2 +// CK10-NEXT: store ptr [[TMP46]], ptr [[TMP50]], align 8 +// CK10-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 3 +// CK10-NEXT: store ptr [[TMP47]], ptr [[TMP51]], align 8 +// CK10-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 4 +// CK10-NEXT: store ptr @.offload_sizes.3, ptr [[TMP52]], align 8 +// CK10-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 5 +// CK10-NEXT: store ptr @.offload_maptypes.4, ptr [[TMP53]], align 8 +// CK10-NEXT: [[TMP54:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 6 +// CK10-NEXT: store ptr null, ptr [[TMP54]], align 8 +// CK10-NEXT: [[TMP55:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 7 +// CK10-NEXT: store ptr null, ptr [[TMP55]], align 8 +// CK10-NEXT: [[TMP56:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 8 +// CK10-NEXT: store i64 0, ptr [[TMP56]], align 8 +// CK10-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 9 +// CK10-NEXT: store i64 0, ptr [[TMP57]], align 8 +// CK10-NEXT: [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 10 +// CK10-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP58]], align 4 +// CK10-NEXT: [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 11 +// CK10-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP59]], align 4 +// CK10-NEXT: [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 12 +// CK10-NEXT: store i32 0, ptr [[TMP60]], align 4 +// CK10-NEXT: [[TMP61:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l49.region_id, ptr [[KERNEL_ARGS10]]) +// CK10-NEXT: [[TMP62:%.*]] = icmp ne i32 [[TMP61]], 0 +// CK10-NEXT: br i1 [[TMP62]], label [[OMP_OFFLOAD_FAILED11:%.*]], label [[OMP_OFFLOAD_CONT12:%.*]] +// CK10: omp_offload.failed11: +// CK10-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l49(ptr [[TMP42]]) #[[ATTR2]] +// CK10-NEXT: br label [[OMP_OFFLOAD_CONT12]] +// CK10: omp_offload.cont12: +// CK10-NEXT: [[TMP63:%.*]] = load ptr, ptr [[LR_ADDR]], align 8 +// CK10-NEXT: store ptr [[TMP63]], ptr [[TMP]], align 8 +// CK10-NEXT: [[TMP64:%.*]] = load ptr, ptr [[TMP]], align 8 +// CK10-NEXT: [[TMP65:%.*]] = load ptr, ptr [[TMP64]], align 8 +// CK10-NEXT: [[TMP66:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS13]], i32 0, i32 0 +// CK10-NEXT: store ptr [[TMP65]], ptr [[TMP66]], align 8 +// CK10-NEXT: [[TMP67:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS14]], i32 0, i32 0 +// CK10-NEXT: store ptr [[TMP65]], ptr [[TMP67]], align 8 +// CK10-NEXT: [[TMP68:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS15]], i64 0, i64 0 +// CK10-NEXT: store ptr null, ptr [[TMP68]], align 8 +// CK10-NEXT: [[TMP69:%.*]] = getelementptr inbounds [1 x 
ptr], ptr [[DOTOFFLOAD_BASEPTRS13]], i32 0, i32 0 +// CK10-NEXT: [[TMP70:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS14]], i32 0, i32 0 +// CK10-NEXT: [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 0 +// CK10-NEXT: store i32 2, ptr [[TMP71]], align 4 +// CK10-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 1 +// CK10-NEXT: store i32 1, ptr [[TMP72]], align 4 +// CK10-NEXT: [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 2 +// CK10-NEXT: store ptr [[TMP69]], ptr [[TMP73]], align 8 +// CK10-NEXT: [[TMP74:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 3 +// CK10-NEXT: store ptr [[TMP70]], ptr [[TMP74]], align 8 +// CK10-NEXT: [[TMP75:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 4 +// CK10-NEXT: store ptr @.offload_sizes.5, ptr [[TMP75]], align 8 +// CK10-NEXT: [[TMP76:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 5 +// CK10-NEXT: store ptr @.offload_maptypes.6, ptr [[TMP76]], align 8 +// CK10-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 6 +// CK10-NEXT: store ptr null, ptr [[TMP77]], align 8 +// CK10-NEXT: [[TMP78:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 7 +// CK10-NEXT: store ptr null, ptr [[TMP78]], align 8 +// CK10-NEXT: [[TMP79:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 8 +// CK10-NEXT: store i64 0, ptr [[TMP79]], align 8 +// CK10-NEXT: [[TMP80:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 9 +// CK10-NEXT: store i64 0, ptr [[TMP80]], align 8 +// CK10-NEXT: [[TMP81:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 10 +// CK10-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP81]], align 4 +// CK10-NEXT: [[TMP82:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 11 +// CK10-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP82]], align 4 +// CK10-NEXT: [[TMP83:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 12 +// CK10-NEXT: store i32 0, ptr [[TMP83]], align 4 +// CK10-NEXT: [[TMP84:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l55.region_id, ptr [[KERNEL_ARGS16]]) +// CK10-NEXT: [[TMP85:%.*]] = icmp ne i32 [[TMP84]], 0 +// CK10-NEXT: br i1 [[TMP85]], label [[OMP_OFFLOAD_FAILED17:%.*]], label [[OMP_OFFLOAD_CONT18:%.*]] +// CK10: omp_offload.failed17: +// CK10-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l55(ptr [[TMP65]]) #[[ATTR2]] +// CK10-NEXT: br label [[OMP_OFFLOAD_CONT18]] +// CK10: omp_offload.cont18: +// CK10-NEXT: [[TMP86:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 +// CK10-NEXT: store ptr [[TMP86]], ptr [[_TMP19]], align 8 +// CK10-NEXT: [[TMP87:%.*]] = load ptr, ptr [[_TMP19]], align 8 +// CK10-NEXT: [[TMP88:%.*]] = load ptr, ptr [[TMP87]], align 8 +// CK10-NEXT: [[TMP89:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS20]], i32 0, i32 0 +// CK10-NEXT: store ptr [[TMP88]], ptr [[TMP89]], align 
8 +// CK10-NEXT: [[TMP90:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS21]], i32 0, i32 0 +// CK10-NEXT: store ptr [[TMP88]], ptr [[TMP90]], align 8 +// CK10-NEXT: [[TMP91:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS22]], i64 0, i64 0 +// CK10-NEXT: store ptr null, ptr [[TMP91]], align 8 +// CK10-NEXT: [[TMP92:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS20]], i32 0, i32 0 +// CK10-NEXT: [[TMP93:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS21]], i32 0, i32 0 +// CK10-NEXT: [[TMP94:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0 +// CK10-NEXT: store i32 2, ptr [[TMP94]], align 4 +// CK10-NEXT: [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1 +// CK10-NEXT: store i32 1, ptr [[TMP95]], align 4 +// CK10-NEXT: [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2 +// CK10-NEXT: store ptr [[TMP92]], ptr [[TMP96]], align 8 +// CK10-NEXT: [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 3 +// CK10-NEXT: store ptr [[TMP93]], ptr [[TMP97]], align 8 +// CK10-NEXT: [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 4 +// CK10-NEXT: store ptr @.offload_sizes.7, ptr [[TMP98]], align 8 +// CK10-NEXT: [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 5 +// CK10-NEXT: store ptr @.offload_maptypes.8, ptr [[TMP99]], align 8 +// CK10-NEXT: [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 6 +// CK10-NEXT: store ptr null, ptr [[TMP100]], align 8 +// CK10-NEXT: [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 7 +// CK10-NEXT: store ptr null, ptr [[TMP101]], align 8 +// CK10-NEXT: [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 8 +// CK10-NEXT: store i64 0, ptr [[TMP102]], align 8 +// CK10-NEXT: [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 9 +// CK10-NEXT: store i64 0, ptr [[TMP103]], align 8 +// CK10-NEXT: [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 10 +// CK10-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP104]], align 4 +// CK10-NEXT: [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 11 +// CK10-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP105]], align 4 +// CK10-NEXT: [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 12 +// CK10-NEXT: store i32 0, ptr [[TMP106]], align 4 +// CK10-NEXT: [[TMP107:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l61.region_id, ptr [[KERNEL_ARGS23]]) +// CK10-NEXT: [[TMP108:%.*]] = icmp ne i32 [[TMP107]], 0 +// CK10-NEXT: br i1 [[TMP108]], label [[OMP_OFFLOAD_FAILED24:%.*]], label [[OMP_OFFLOAD_CONT25:%.*]] +// CK10: omp_offload.failed24: +// CK10-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l61(ptr [[TMP88]]) #[[ATTR2]] +// CK10-NEXT: br label [[OMP_OFFLOAD_CONT25]] +// CK10: omp_offload.cont25: +// CK10-NEXT: 
[[TMP109:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 +// CK10-NEXT: store ptr [[TMP109]], ptr [[_TMP26]], align 8 +// CK10-NEXT: [[TMP110:%.*]] = load ptr, ptr [[_TMP26]], align 8 +// CK10-NEXT: [[TMP111:%.*]] = load ptr, ptr [[TMP110]], align 8 +// CK10-NEXT: [[TMP112:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0 +// CK10-NEXT: store ptr [[TMP111]], ptr [[TMP112]], align 8 +// CK10-NEXT: [[TMP113:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0 +// CK10-NEXT: store ptr [[TMP111]], ptr [[TMP113]], align 8 +// CK10-NEXT: [[TMP114:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS29]], i64 0, i64 0 +// CK10-NEXT: store ptr null, ptr [[TMP114]], align 8 +// CK10-NEXT: [[TMP115:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0 +// CK10-NEXT: [[TMP116:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0 +// CK10-NEXT: [[TMP117:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0 +// CK10-NEXT: store i32 2, ptr [[TMP117]], align 4 +// CK10-NEXT: [[TMP118:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1 +// CK10-NEXT: store i32 1, ptr [[TMP118]], align 4 +// CK10-NEXT: [[TMP119:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2 +// CK10-NEXT: store ptr [[TMP115]], ptr [[TMP119]], align 8 +// CK10-NEXT: [[TMP120:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 3 +// CK10-NEXT: store ptr [[TMP116]], ptr [[TMP120]], align 8 +// CK10-NEXT: [[TMP121:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 4 +// CK10-NEXT: store ptr @.offload_sizes.9, ptr [[TMP121]], align 8 +// CK10-NEXT: [[TMP122:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 5 +// CK10-NEXT: store ptr @.offload_maptypes.10, ptr [[TMP122]], align 8 +// CK10-NEXT: [[TMP123:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 6 +// CK10-NEXT: store ptr null, ptr [[TMP123]], align 8 +// CK10-NEXT: [[TMP124:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 7 +// CK10-NEXT: store ptr null, ptr [[TMP124]], align 8 +// CK10-NEXT: [[TMP125:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 8 +// CK10-NEXT: store i64 0, ptr [[TMP125]], align 8 +// CK10-NEXT: [[TMP126:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 9 +// CK10-NEXT: store i64 0, ptr [[TMP126]], align 8 +// CK10-NEXT: [[TMP127:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 10 +// CK10-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP127]], align 4 +// CK10-NEXT: [[TMP128:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 11 +// CK10-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP128]], align 4 +// CK10-NEXT: [[TMP129:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 12 +// CK10-NEXT: store i32 0, ptr [[TMP129]], align 4 +// CK10-NEXT: [[TMP130:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr 
@.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l67.region_id, ptr [[KERNEL_ARGS30]]) +// CK10-NEXT: [[TMP131:%.*]] = icmp ne i32 [[TMP130]], 0 +// CK10-NEXT: br i1 [[TMP131]], label [[OMP_OFFLOAD_FAILED31:%.*]], label [[OMP_OFFLOAD_CONT32:%.*]] +// CK10: omp_offload.failed31: +// CK10-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l67(ptr [[TMP111]]) #[[ATTR2]] +// CK10-NEXT: br label [[OMP_OFFLOAD_CONT32]] +// CK10: omp_offload.cont32: +// CK10-NEXT: [[TMP132:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 +// CK10-NEXT: store ptr [[TMP132]], ptr [[_TMP33]], align 8 +// CK10-NEXT: [[TMP133:%.*]] = load ptr, ptr [[LR_ADDR]], align 8 +// CK10-NEXT: store ptr [[TMP133]], ptr [[_TMP34]], align 8 +// CK10-NEXT: [[TMP134:%.*]] = load ptr, ptr [[_TMP33]], align 8 +// CK10-NEXT: [[TMP135:%.*]] = load ptr, ptr [[TMP134]], align 8 +// CK10-NEXT: [[TMP136:%.*]] = load ptr, ptr [[_TMP34]], align 8 +// CK10-NEXT: [[TMP137:%.*]] = load ptr, ptr [[TMP136]], align 8 +// CK10-NEXT: [[TMP138:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 0 +// CK10-NEXT: store ptr [[TMP135]], ptr [[TMP138]], align 8 +// CK10-NEXT: [[TMP139:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 0 +// CK10-NEXT: store ptr [[TMP135]], ptr [[TMP139]], align 8 +// CK10-NEXT: [[TMP140:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS37]], i64 0, i64 0 +// CK10-NEXT: store ptr null, ptr [[TMP140]], align 8 +// CK10-NEXT: [[TMP141:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 1 +// CK10-NEXT: store ptr [[TMP137]], ptr [[TMP141]], align 8 +// CK10-NEXT: [[TMP142:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 1 +// CK10-NEXT: store ptr [[TMP137]], ptr [[TMP142]], align 8 +// CK10-NEXT: [[TMP143:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS37]], i64 0, i64 1 +// CK10-NEXT: store ptr null, ptr [[TMP143]], align 8 +// CK10-NEXT: [[TMP144:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 0 +// CK10-NEXT: [[TMP145:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 0 +// CK10-NEXT: [[TMP146:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 0 +// CK10-NEXT: store i32 2, ptr [[TMP146]], align 4 +// CK10-NEXT: [[TMP147:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 1 +// CK10-NEXT: store i32 2, ptr [[TMP147]], align 4 +// CK10-NEXT: [[TMP148:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 2 +// CK10-NEXT: store ptr [[TMP144]], ptr [[TMP148]], align 8 +// CK10-NEXT: [[TMP149:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 3 +// CK10-NEXT: store ptr [[TMP145]], ptr [[TMP149]], align 8 +// CK10-NEXT: [[TMP150:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 4 +// CK10-NEXT: store ptr @.offload_sizes.11, ptr [[TMP150]], align 8 +// CK10-NEXT: [[TMP151:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 5 +// CK10-NEXT: store ptr @.offload_maptypes.12, ptr [[TMP151]], align 8 +// CK10-NEXT: [[TMP152:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 6 +// CK10-NEXT: store ptr null, ptr [[TMP152]], align 8 +// CK10-NEXT: 
[[TMP153:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 7 +// CK10-NEXT: store ptr null, ptr [[TMP153]], align 8 +// CK10-NEXT: [[TMP154:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 8 +// CK10-NEXT: store i64 0, ptr [[TMP154]], align 8 +// CK10-NEXT: [[TMP155:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 9 +// CK10-NEXT: store i64 0, ptr [[TMP155]], align 8 +// CK10-NEXT: [[TMP156:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 10 +// CK10-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP156]], align 4 +// CK10-NEXT: [[TMP157:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 11 +// CK10-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP157]], align 4 +// CK10-NEXT: [[TMP158:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 12 +// CK10-NEXT: store i32 0, ptr [[TMP158]], align 4 +// CK10-NEXT: [[TMP159:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l74.region_id, ptr [[KERNEL_ARGS38]]) +// CK10-NEXT: [[TMP160:%.*]] = icmp ne i32 [[TMP159]], 0 +// CK10-NEXT: br i1 [[TMP160]], label [[OMP_OFFLOAD_FAILED39:%.*]], label [[OMP_OFFLOAD_CONT40:%.*]] +// CK10: omp_offload.failed39: +// CK10-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l74(ptr [[TMP135]], ptr [[TMP137]]) #[[ATTR2]] +// CK10-NEXT: br label [[OMP_OFFLOAD_CONT40]] +// CK10: omp_offload.cont40: +// CK10-NEXT: ret void +// +// +// CK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l37 +// CK10-SAME: (ptr noundef [[G:%.*]]) #[[ATTR1:[0-9]+]] { +// CK10-NEXT: entry: +// CK10-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8 +// CK10-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8 +// CK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[G_ADDR]], align 8 +// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK10-NEXT: store ptr [[INCDEC_PTR]], ptr [[G_ADDR]], align 8 +// CK10-NEXT: ret void +// +// +// CK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l43 +// CK10-SAME: (ptr noundef [[L:%.*]]) #[[ATTR1]] { +// CK10-NEXT: entry: +// CK10-NEXT: [[L_ADDR:%.*]] = alloca ptr, align 8 +// CK10-NEXT: store ptr [[L]], ptr [[L_ADDR]], align 8 +// CK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[L_ADDR]], align 8 +// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 1 +// CK10-NEXT: store ptr [[INCDEC_PTR]], ptr [[L_ADDR]], align 8 +// CK10-NEXT: ret void +// +// +// CK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l49 +// CK10-SAME: (ptr noundef [[T:%.*]]) #[[ATTR1]] { +// CK10-NEXT: entry: +// CK10-NEXT: [[T_ADDR:%.*]] = alloca ptr, align 8 +// CK10-NEXT: store ptr [[T]], ptr [[T_ADDR]], align 8 +// CK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_ADDR]], align 8 +// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 1 +// CK10-NEXT: store ptr [[INCDEC_PTR]], ptr [[T_ADDR]], align 8 +// CK10-NEXT: ret void +// +// +// CK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l55 +// CK10-SAME: (ptr noundef [[LR:%.*]]) #[[ATTR1]] { +// CK10-NEXT: entry: +// CK10-NEXT: [[LR_ADDR:%.*]] = alloca ptr, align 8 +// 
CK10-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// CK10-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 8 +// CK10-NEXT: store ptr [[LR_ADDR]], ptr [[TMP]], align 8 +// CK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 +// CK10-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 +// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// CK10-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 +// CK10-NEXT: ret void +// +// +// CK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l61 +// CK10-SAME: (ptr noundef [[TR:%.*]]) #[[ATTR1]] { +// CK10-NEXT: entry: +// CK10-NEXT: [[TR_ADDR:%.*]] = alloca ptr, align 8 +// CK10-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// CK10-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 8 +// CK10-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 8 +// CK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 +// CK10-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 +// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK10-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 +// CK10-NEXT: ret void +// +// +// CK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l67 +// CK10-SAME: (ptr noundef [[TR:%.*]]) #[[ATTR1]] { +// CK10-NEXT: entry: +// CK10-NEXT: [[TR_ADDR:%.*]] = alloca ptr, align 8 +// CK10-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// CK10-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 8 +// CK10-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 8 +// CK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 +// CK10-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 +// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK10-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 +// CK10-NEXT: ret void +// +// +// CK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l74 +// CK10-SAME: (ptr noundef [[TR:%.*]], ptr noundef [[LR:%.*]]) #[[ATTR1]] { +// CK10-NEXT: entry: +// CK10-NEXT: [[TR_ADDR:%.*]] = alloca ptr, align 8 +// CK10-NEXT: [[LR_ADDR:%.*]] = alloca ptr, align 8 +// CK10-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// CK10-NEXT: [[_TMP1:%.*]] = alloca ptr, align 8 +// CK10-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 8 +// CK10-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 8 +// CK10-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 8 +// CK10-NEXT: store ptr [[LR_ADDR]], ptr [[_TMP1]], align 8 +// CK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 +// CK10-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 +// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK10-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 +// CK10-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP1]], align 8 +// CK10-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8 +// CK10-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 1 +// CK10-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP2]], align 8 +// CK10-NEXT: ret void +// +// +// CK10-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// CK10-SAME: () #[[ATTR3:[0-9]+]] { +// CK10-NEXT: entry: +// CK10-NEXT: call void @__tgt_register_requires(i64 1) +// CK10-NEXT: ret void +// +// +// CK11-LABEL: define {{[^@]+}}@_Z3barRPfRPi +// CK11-SAME: (ptr noundef nonnull align 8 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]]) #[[ATTR0:[0-9]+]] { +// CK11-NEXT: entry: +// CK11-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +// 
CK11-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// CK11-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 +// CK11-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// CK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// CK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// CK11-NEXT: call void @_Z3fooIiEvRPfRPT_(ptr noundef nonnull align 8 dereferenceable(8) [[TMP0]], ptr noundef nonnull align 8 dereferenceable(8) [[TMP1]]) +// CK11-NEXT: ret void +// +// +// CK11-LABEL: define {{[^@]+}}@_Z3fooIiEvRPfRPT_ +// CK11-SAME: (ptr noundef nonnull align 8 dereferenceable(8) [[LR:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[TR:%.*]]) #[[ATTR0]] comdat { +// CK11-NEXT: entry: +// CK11-NEXT: [[LR_ADDR:%.*]] = alloca ptr, align 8 +// CK11-NEXT: [[TR_ADDR:%.*]] = alloca ptr, align 8 +// CK11-NEXT: [[L:%.*]] = alloca ptr, align 8 +// CK11-NEXT: [[T:%.*]] = alloca ptr, align 8 +// CK11-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +// CK11-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 +// CK11-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 +// CK11-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK11-NEXT: [[DOTOFFLOAD_BASEPTRS1:%.*]] = alloca [1 x ptr], align 8 +// CK11-NEXT: [[DOTOFFLOAD_PTRS2:%.*]] = alloca [1 x ptr], align 8 +// CK11-NEXT: [[DOTOFFLOAD_MAPPERS3:%.*]] = alloca [1 x ptr], align 8 +// CK11-NEXT: [[KERNEL_ARGS4:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK11-NEXT: [[DOTOFFLOAD_BASEPTRS7:%.*]] = alloca [1 x ptr], align 8 +// CK11-NEXT: [[DOTOFFLOAD_PTRS8:%.*]] = alloca [1 x ptr], align 8 +// CK11-NEXT: [[DOTOFFLOAD_MAPPERS9:%.*]] = alloca [1 x ptr], align 8 +// CK11-NEXT: [[KERNEL_ARGS10:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK11-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// CK11-NEXT: [[DOTOFFLOAD_BASEPTRS13:%.*]] = alloca [1 x ptr], align 8 +// CK11-NEXT: [[DOTOFFLOAD_PTRS14:%.*]] = alloca [1 x ptr], align 8 +// CK11-NEXT: [[DOTOFFLOAD_MAPPERS15:%.*]] = alloca [1 x ptr], align 8 +// CK11-NEXT: [[KERNEL_ARGS16:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK11-NEXT: [[_TMP19:%.*]] = alloca ptr, align 8 +// CK11-NEXT: [[DOTOFFLOAD_BASEPTRS20:%.*]] = alloca [1 x ptr], align 8 +// CK11-NEXT: [[DOTOFFLOAD_PTRS21:%.*]] = alloca [1 x ptr], align 8 +// CK11-NEXT: [[DOTOFFLOAD_MAPPERS22:%.*]] = alloca [1 x ptr], align 8 +// CK11-NEXT: [[KERNEL_ARGS23:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK11-NEXT: [[_TMP26:%.*]] = alloca ptr, align 8 +// CK11-NEXT: [[DOTOFFLOAD_BASEPTRS27:%.*]] = alloca [1 x ptr], align 8 +// CK11-NEXT: [[DOTOFFLOAD_PTRS28:%.*]] = alloca [1 x ptr], align 8 +// CK11-NEXT: [[DOTOFFLOAD_MAPPERS29:%.*]] = alloca [1 x ptr], align 8 +// CK11-NEXT: [[KERNEL_ARGS30:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK11-NEXT: [[_TMP33:%.*]] = alloca ptr, align 8 +// CK11-NEXT: [[_TMP34:%.*]] = alloca ptr, align 8 +// CK11-NEXT: [[DOTOFFLOAD_BASEPTRS35:%.*]] = alloca [2 x ptr], align 8 +// CK11-NEXT: [[DOTOFFLOAD_PTRS36:%.*]] = alloca [2 x ptr], align 8 +// CK11-NEXT: [[DOTOFFLOAD_MAPPERS37:%.*]] = alloca [2 x ptr], align 8 +// CK11-NEXT: [[KERNEL_ARGS38:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK11-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 8 +// CK11-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 8 +// CK11-NEXT: [[TMP0:%.*]] = load ptr, ptr @g, align 8 +// CK11-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// 
CK11-NEXT: store ptr [[TMP0]], ptr [[TMP1]], align 8 +// CK11-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK11-NEXT: store ptr [[TMP0]], ptr [[TMP2]], align 8 +// CK11-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CK11-NEXT: store ptr null, ptr [[TMP3]], align 8 +// CK11-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK11-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK11-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK11-NEXT: store i32 2, ptr [[TMP6]], align 4 +// CK11-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK11-NEXT: store i32 1, ptr [[TMP7]], align 4 +// CK11-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK11-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 8 +// CK11-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK11-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 8 +// CK11-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK11-NEXT: store ptr @.offload_sizes, ptr [[TMP10]], align 8 +// CK11-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK11-NEXT: store ptr @.offload_maptypes, ptr [[TMP11]], align 8 +// CK11-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK11-NEXT: store ptr null, ptr [[TMP12]], align 8 +// CK11-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK11-NEXT: store ptr null, ptr [[TMP13]], align 8 +// CK11-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK11-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK11-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK11-NEXT: store i64 0, ptr [[TMP15]], align 8 +// CK11-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK11-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP16]], align 4 +// CK11-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK11-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4 +// CK11-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK11-NEXT: store i32 0, ptr [[TMP18]], align 4 +// CK11-NEXT: [[TMP19:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l37.region_id, ptr [[KERNEL_ARGS]]) +// CK11-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CK11-NEXT: br i1 [[TMP20]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CK11: omp_offload.failed: +// CK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l37(ptr [[TMP0]]) #[[ATTR2:[0-9]+]] +// CK11-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CK11: omp_offload.cont: +// CK11-NEXT: [[TMP21:%.*]] = load 
ptr, ptr [[L]], align 8 +// CK11-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 +// CK11-NEXT: store ptr [[TMP21]], ptr [[TMP22]], align 8 +// CK11-NEXT: [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0 +// CK11-NEXT: store ptr [[TMP21]], ptr [[TMP23]], align 8 +// CK11-NEXT: [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS3]], i64 0, i64 0 +// CK11-NEXT: store ptr null, ptr [[TMP24]], align 8 +// CK11-NEXT: [[TMP25:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 +// CK11-NEXT: [[TMP26:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0 +// CK11-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0 +// CK11-NEXT: store i32 2, ptr [[TMP27]], align 4 +// CK11-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1 +// CK11-NEXT: store i32 1, ptr [[TMP28]], align 4 +// CK11-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2 +// CK11-NEXT: store ptr [[TMP25]], ptr [[TMP29]], align 8 +// CK11-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 3 +// CK11-NEXT: store ptr [[TMP26]], ptr [[TMP30]], align 8 +// CK11-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 4 +// CK11-NEXT: store ptr @.offload_sizes.1, ptr [[TMP31]], align 8 +// CK11-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 5 +// CK11-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP32]], align 8 +// CK11-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 6 +// CK11-NEXT: store ptr null, ptr [[TMP33]], align 8 +// CK11-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 7 +// CK11-NEXT: store ptr null, ptr [[TMP34]], align 8 +// CK11-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 8 +// CK11-NEXT: store i64 0, ptr [[TMP35]], align 8 +// CK11-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 9 +// CK11-NEXT: store i64 0, ptr [[TMP36]], align 8 +// CK11-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 10 +// CK11-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP37]], align 4 +// CK11-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 11 +// CK11-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP38]], align 4 +// CK11-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 12 +// CK11-NEXT: store i32 0, ptr [[TMP39]], align 4 +// CK11-NEXT: [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l43.region_id, ptr [[KERNEL_ARGS4]]) +// CK11-NEXT: [[TMP41:%.*]] = icmp ne i32 [[TMP40]], 0 +// CK11-NEXT: br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED5:%.*]], label [[OMP_OFFLOAD_CONT6:%.*]] +// CK11: omp_offload.failed5: +// CK11-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l43(ptr [[TMP21]]) #[[ATTR2]] +// CK11-NEXT: br label [[OMP_OFFLOAD_CONT6]] +// CK11: omp_offload.cont6: +// CK11-NEXT: [[TMP42:%.*]] = load ptr, ptr [[T]], align 8 +// CK11-NEXT: [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 0 +// CK11-NEXT: store ptr [[TMP42]], ptr [[TMP43]], align 8 +// CK11-NEXT: [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS8]], i32 0, i32 0 +// CK11-NEXT: store ptr [[TMP42]], ptr [[TMP44]], align 8 +// CK11-NEXT: [[TMP45:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS9]], i64 0, i64 0 +// CK11-NEXT: store ptr null, ptr [[TMP45]], align 8 +// CK11-NEXT: [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 0 +// CK11-NEXT: [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS8]], i32 0, i32 0 +// CK11-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 0 +// CK11-NEXT: store i32 2, ptr [[TMP48]], align 4 +// CK11-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 1 +// CK11-NEXT: store i32 1, ptr [[TMP49]], align 4 +// CK11-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 2 +// CK11-NEXT: store ptr [[TMP46]], ptr [[TMP50]], align 8 +// CK11-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 3 +// CK11-NEXT: store ptr [[TMP47]], ptr [[TMP51]], align 8 +// CK11-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 4 +// CK11-NEXT: store ptr @.offload_sizes.3, ptr [[TMP52]], align 8 +// CK11-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 5 +// CK11-NEXT: store ptr @.offload_maptypes.4, ptr [[TMP53]], align 8 +// CK11-NEXT: [[TMP54:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 6 +// CK11-NEXT: store ptr null, ptr [[TMP54]], align 8 +// CK11-NEXT: [[TMP55:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 7 +// CK11-NEXT: store ptr null, ptr [[TMP55]], align 8 +// CK11-NEXT: [[TMP56:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 8 +// CK11-NEXT: store i64 0, ptr [[TMP56]], align 8 +// CK11-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 9 +// CK11-NEXT: store i64 0, ptr [[TMP57]], align 8 +// CK11-NEXT: [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 10 +// CK11-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP58]], align 4 +// CK11-NEXT: [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 11 +// CK11-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP59]], align 4 +// CK11-NEXT: [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 12 +// CK11-NEXT: store i32 0, ptr [[TMP60]], align 4 +// CK11-NEXT: [[TMP61:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l49.region_id, ptr [[KERNEL_ARGS10]]) +// CK11-NEXT: [[TMP62:%.*]] 
= icmp ne i32 [[TMP61]], 0 +// CK11-NEXT: br i1 [[TMP62]], label [[OMP_OFFLOAD_FAILED11:%.*]], label [[OMP_OFFLOAD_CONT12:%.*]] +// CK11: omp_offload.failed11: +// CK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l49(ptr [[TMP42]]) #[[ATTR2]] +// CK11-NEXT: br label [[OMP_OFFLOAD_CONT12]] +// CK11: omp_offload.cont12: +// CK11-NEXT: [[TMP63:%.*]] = load ptr, ptr [[LR_ADDR]], align 8 +// CK11-NEXT: store ptr [[TMP63]], ptr [[TMP]], align 8 +// CK11-NEXT: [[TMP64:%.*]] = load ptr, ptr [[TMP]], align 8 +// CK11-NEXT: [[TMP65:%.*]] = load ptr, ptr [[TMP64]], align 8 +// CK11-NEXT: [[TMP66:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS13]], i32 0, i32 0 +// CK11-NEXT: store ptr [[TMP65]], ptr [[TMP66]], align 8 +// CK11-NEXT: [[TMP67:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS14]], i32 0, i32 0 +// CK11-NEXT: store ptr [[TMP65]], ptr [[TMP67]], align 8 +// CK11-NEXT: [[TMP68:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS15]], i64 0, i64 0 +// CK11-NEXT: store ptr null, ptr [[TMP68]], align 8 +// CK11-NEXT: [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS13]], i32 0, i32 0 +// CK11-NEXT: [[TMP70:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS14]], i32 0, i32 0 +// CK11-NEXT: [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 0 +// CK11-NEXT: store i32 2, ptr [[TMP71]], align 4 +// CK11-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 1 +// CK11-NEXT: store i32 1, ptr [[TMP72]], align 4 +// CK11-NEXT: [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 2 +// CK11-NEXT: store ptr [[TMP69]], ptr [[TMP73]], align 8 +// CK11-NEXT: [[TMP74:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 3 +// CK11-NEXT: store ptr [[TMP70]], ptr [[TMP74]], align 8 +// CK11-NEXT: [[TMP75:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 4 +// CK11-NEXT: store ptr @.offload_sizes.5, ptr [[TMP75]], align 8 +// CK11-NEXT: [[TMP76:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 5 +// CK11-NEXT: store ptr @.offload_maptypes.6, ptr [[TMP76]], align 8 +// CK11-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 6 +// CK11-NEXT: store ptr null, ptr [[TMP77]], align 8 +// CK11-NEXT: [[TMP78:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 7 +// CK11-NEXT: store ptr null, ptr [[TMP78]], align 8 +// CK11-NEXT: [[TMP79:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 8 +// CK11-NEXT: store i64 0, ptr [[TMP79]], align 8 +// CK11-NEXT: [[TMP80:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 9 +// CK11-NEXT: store i64 0, ptr [[TMP80]], align 8 +// CK11-NEXT: [[TMP81:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 10 +// CK11-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP81]], align 4 +// CK11-NEXT: [[TMP82:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 11 +// CK11-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP82]], align 4 +// CK11-NEXT: 
[[TMP83:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 12 +// CK11-NEXT: store i32 0, ptr [[TMP83]], align 4 +// CK11-NEXT: [[TMP84:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l55.region_id, ptr [[KERNEL_ARGS16]]) +// CK11-NEXT: [[TMP85:%.*]] = icmp ne i32 [[TMP84]], 0 +// CK11-NEXT: br i1 [[TMP85]], label [[OMP_OFFLOAD_FAILED17:%.*]], label [[OMP_OFFLOAD_CONT18:%.*]] +// CK11: omp_offload.failed17: +// CK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l55(ptr [[TMP65]]) #[[ATTR2]] +// CK11-NEXT: br label [[OMP_OFFLOAD_CONT18]] +// CK11: omp_offload.cont18: +// CK11-NEXT: [[TMP86:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 +// CK11-NEXT: store ptr [[TMP86]], ptr [[_TMP19]], align 8 +// CK11-NEXT: [[TMP87:%.*]] = load ptr, ptr [[_TMP19]], align 8 +// CK11-NEXT: [[TMP88:%.*]] = load ptr, ptr [[TMP87]], align 8 +// CK11-NEXT: [[TMP89:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS20]], i32 0, i32 0 +// CK11-NEXT: store ptr [[TMP88]], ptr [[TMP89]], align 8 +// CK11-NEXT: [[TMP90:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS21]], i32 0, i32 0 +// CK11-NEXT: store ptr [[TMP88]], ptr [[TMP90]], align 8 +// CK11-NEXT: [[TMP91:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS22]], i64 0, i64 0 +// CK11-NEXT: store ptr null, ptr [[TMP91]], align 8 +// CK11-NEXT: [[TMP92:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS20]], i32 0, i32 0 +// CK11-NEXT: [[TMP93:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS21]], i32 0, i32 0 +// CK11-NEXT: [[TMP94:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0 +// CK11-NEXT: store i32 2, ptr [[TMP94]], align 4 +// CK11-NEXT: [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1 +// CK11-NEXT: store i32 1, ptr [[TMP95]], align 4 +// CK11-NEXT: [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2 +// CK11-NEXT: store ptr [[TMP92]], ptr [[TMP96]], align 8 +// CK11-NEXT: [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 3 +// CK11-NEXT: store ptr [[TMP93]], ptr [[TMP97]], align 8 +// CK11-NEXT: [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 4 +// CK11-NEXT: store ptr @.offload_sizes.7, ptr [[TMP98]], align 8 +// CK11-NEXT: [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 5 +// CK11-NEXT: store ptr @.offload_maptypes.8, ptr [[TMP99]], align 8 +// CK11-NEXT: [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 6 +// CK11-NEXT: store ptr null, ptr [[TMP100]], align 8 +// CK11-NEXT: [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 7 +// CK11-NEXT: store ptr null, ptr [[TMP101]], align 8 +// CK11-NEXT: [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 8 +// CK11-NEXT: store i64 0, ptr [[TMP102]], align 8 +// CK11-NEXT: [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 9 +// CK11-NEXT: store i64 0, ptr [[TMP103]], align 8 +// CK11-NEXT: 
[[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 10 +// CK11-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP104]], align 4 +// CK11-NEXT: [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 11 +// CK11-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP105]], align 4 +// CK11-NEXT: [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 12 +// CK11-NEXT: store i32 0, ptr [[TMP106]], align 4 +// CK11-NEXT: [[TMP107:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l61.region_id, ptr [[KERNEL_ARGS23]]) +// CK11-NEXT: [[TMP108:%.*]] = icmp ne i32 [[TMP107]], 0 +// CK11-NEXT: br i1 [[TMP108]], label [[OMP_OFFLOAD_FAILED24:%.*]], label [[OMP_OFFLOAD_CONT25:%.*]] +// CK11: omp_offload.failed24: +// CK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l61(ptr [[TMP88]]) #[[ATTR2]] +// CK11-NEXT: br label [[OMP_OFFLOAD_CONT25]] +// CK11: omp_offload.cont25: +// CK11-NEXT: [[TMP109:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 +// CK11-NEXT: store ptr [[TMP109]], ptr [[_TMP26]], align 8 +// CK11-NEXT: [[TMP110:%.*]] = load ptr, ptr [[_TMP26]], align 8 +// CK11-NEXT: [[TMP111:%.*]] = load ptr, ptr [[TMP110]], align 8 +// CK11-NEXT: [[TMP112:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0 +// CK11-NEXT: store ptr [[TMP111]], ptr [[TMP112]], align 8 +// CK11-NEXT: [[TMP113:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0 +// CK11-NEXT: store ptr [[TMP111]], ptr [[TMP113]], align 8 +// CK11-NEXT: [[TMP114:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS29]], i64 0, i64 0 +// CK11-NEXT: store ptr null, ptr [[TMP114]], align 8 +// CK11-NEXT: [[TMP115:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0 +// CK11-NEXT: [[TMP116:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0 +// CK11-NEXT: [[TMP117:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0 +// CK11-NEXT: store i32 2, ptr [[TMP117]], align 4 +// CK11-NEXT: [[TMP118:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1 +// CK11-NEXT: store i32 1, ptr [[TMP118]], align 4 +// CK11-NEXT: [[TMP119:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2 +// CK11-NEXT: store ptr [[TMP115]], ptr [[TMP119]], align 8 +// CK11-NEXT: [[TMP120:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 3 +// CK11-NEXT: store ptr [[TMP116]], ptr [[TMP120]], align 8 +// CK11-NEXT: [[TMP121:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 4 +// CK11-NEXT: store ptr @.offload_sizes.9, ptr [[TMP121]], align 8 +// CK11-NEXT: [[TMP122:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 5 +// CK11-NEXT: store ptr @.offload_maptypes.10, ptr [[TMP122]], align 8 +// CK11-NEXT: [[TMP123:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 6 +// CK11-NEXT: store ptr null, ptr [[TMP123]], align 8 +// CK11-NEXT: [[TMP124:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 
0, i32 7 +// CK11-NEXT: store ptr null, ptr [[TMP124]], align 8 +// CK11-NEXT: [[TMP125:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 8 +// CK11-NEXT: store i64 0, ptr [[TMP125]], align 8 +// CK11-NEXT: [[TMP126:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 9 +// CK11-NEXT: store i64 0, ptr [[TMP126]], align 8 +// CK11-NEXT: [[TMP127:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 10 +// CK11-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP127]], align 4 +// CK11-NEXT: [[TMP128:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 11 +// CK11-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP128]], align 4 +// CK11-NEXT: [[TMP129:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 12 +// CK11-NEXT: store i32 0, ptr [[TMP129]], align 4 +// CK11-NEXT: [[TMP130:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l67.region_id, ptr [[KERNEL_ARGS30]]) +// CK11-NEXT: [[TMP131:%.*]] = icmp ne i32 [[TMP130]], 0 +// CK11-NEXT: br i1 [[TMP131]], label [[OMP_OFFLOAD_FAILED31:%.*]], label [[OMP_OFFLOAD_CONT32:%.*]] +// CK11: omp_offload.failed31: +// CK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l67(ptr [[TMP111]]) #[[ATTR2]] +// CK11-NEXT: br label [[OMP_OFFLOAD_CONT32]] +// CK11: omp_offload.cont32: +// CK11-NEXT: [[TMP132:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 +// CK11-NEXT: store ptr [[TMP132]], ptr [[_TMP33]], align 8 +// CK11-NEXT: [[TMP133:%.*]] = load ptr, ptr [[LR_ADDR]], align 8 +// CK11-NEXT: store ptr [[TMP133]], ptr [[_TMP34]], align 8 +// CK11-NEXT: [[TMP134:%.*]] = load ptr, ptr [[_TMP33]], align 8 +// CK11-NEXT: [[TMP135:%.*]] = load ptr, ptr [[TMP134]], align 8 +// CK11-NEXT: [[TMP136:%.*]] = load ptr, ptr [[_TMP34]], align 8 +// CK11-NEXT: [[TMP137:%.*]] = load ptr, ptr [[TMP136]], align 8 +// CK11-NEXT: [[TMP138:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 0 +// CK11-NEXT: store ptr [[TMP135]], ptr [[TMP138]], align 8 +// CK11-NEXT: [[TMP139:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 0 +// CK11-NEXT: store ptr [[TMP135]], ptr [[TMP139]], align 8 +// CK11-NEXT: [[TMP140:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS37]], i64 0, i64 0 +// CK11-NEXT: store ptr null, ptr [[TMP140]], align 8 +// CK11-NEXT: [[TMP141:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 1 +// CK11-NEXT: store ptr [[TMP137]], ptr [[TMP141]], align 8 +// CK11-NEXT: [[TMP142:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 1 +// CK11-NEXT: store ptr [[TMP137]], ptr [[TMP142]], align 8 +// CK11-NEXT: [[TMP143:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS37]], i64 0, i64 1 +// CK11-NEXT: store ptr null, ptr [[TMP143]], align 8 +// CK11-NEXT: [[TMP144:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 0 +// CK11-NEXT: [[TMP145:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 0 +// CK11-NEXT: [[TMP146:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 0 +// CK11-NEXT: store i32 2, ptr [[TMP146]], align 4 +// CK11-NEXT: [[TMP147:%.*]] = 
getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 1 +// CK11-NEXT: store i32 2, ptr [[TMP147]], align 4 +// CK11-NEXT: [[TMP148:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 2 +// CK11-NEXT: store ptr [[TMP144]], ptr [[TMP148]], align 8 +// CK11-NEXT: [[TMP149:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 3 +// CK11-NEXT: store ptr [[TMP145]], ptr [[TMP149]], align 8 +// CK11-NEXT: [[TMP150:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 4 +// CK11-NEXT: store ptr @.offload_sizes.11, ptr [[TMP150]], align 8 +// CK11-NEXT: [[TMP151:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 5 +// CK11-NEXT: store ptr @.offload_maptypes.12, ptr [[TMP151]], align 8 +// CK11-NEXT: [[TMP152:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 6 +// CK11-NEXT: store ptr null, ptr [[TMP152]], align 8 +// CK11-NEXT: [[TMP153:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 7 +// CK11-NEXT: store ptr null, ptr [[TMP153]], align 8 +// CK11-NEXT: [[TMP154:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 8 +// CK11-NEXT: store i64 0, ptr [[TMP154]], align 8 +// CK11-NEXT: [[TMP155:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 9 +// CK11-NEXT: store i64 0, ptr [[TMP155]], align 8 +// CK11-NEXT: [[TMP156:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 10 +// CK11-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP156]], align 4 +// CK11-NEXT: [[TMP157:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 11 +// CK11-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP157]], align 4 +// CK11-NEXT: [[TMP158:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 12 +// CK11-NEXT: store i32 0, ptr [[TMP158]], align 4 +// CK11-NEXT: [[TMP159:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l74.region_id, ptr [[KERNEL_ARGS38]]) +// CK11-NEXT: [[TMP160:%.*]] = icmp ne i32 [[TMP159]], 0 +// CK11-NEXT: br i1 [[TMP160]], label [[OMP_OFFLOAD_FAILED39:%.*]], label [[OMP_OFFLOAD_CONT40:%.*]] +// CK11: omp_offload.failed39: +// CK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l74(ptr [[TMP135]], ptr [[TMP137]]) #[[ATTR2]] +// CK11-NEXT: br label [[OMP_OFFLOAD_CONT40]] +// CK11: omp_offload.cont40: +// CK11-NEXT: ret void +// +// +// CK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l37 +// CK11-SAME: (ptr noundef [[G:%.*]]) #[[ATTR1:[0-9]+]] { +// CK11-NEXT: entry: +// CK11-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8 +// CK11-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8 +// CK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[G_ADDR]], align 8 +// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK11-NEXT: store ptr [[INCDEC_PTR]], ptr [[G_ADDR]], align 8 +// CK11-NEXT: ret void +// +// +// CK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l43 +// CK11-SAME: (ptr noundef [[L:%.*]]) #[[ATTR1]] { +// CK11-NEXT: entry: 
+// CK11-NEXT: [[L_ADDR:%.*]] = alloca ptr, align 8 +// CK11-NEXT: store ptr [[L]], ptr [[L_ADDR]], align 8 +// CK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[L_ADDR]], align 8 +// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 1 +// CK11-NEXT: store ptr [[INCDEC_PTR]], ptr [[L_ADDR]], align 8 +// CK11-NEXT: ret void +// +// +// CK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l49 +// CK11-SAME: (ptr noundef [[T:%.*]]) #[[ATTR1]] { +// CK11-NEXT: entry: +// CK11-NEXT: [[T_ADDR:%.*]] = alloca ptr, align 8 +// CK11-NEXT: store ptr [[T]], ptr [[T_ADDR]], align 8 +// CK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_ADDR]], align 8 +// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 1 +// CK11-NEXT: store ptr [[INCDEC_PTR]], ptr [[T_ADDR]], align 8 +// CK11-NEXT: ret void +// +// +// CK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l55 +// CK11-SAME: (ptr noundef [[LR:%.*]]) #[[ATTR1]] { +// CK11-NEXT: entry: +// CK11-NEXT: [[LR_ADDR:%.*]] = alloca ptr, align 8 +// CK11-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// CK11-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 8 +// CK11-NEXT: store ptr [[LR_ADDR]], ptr [[TMP]], align 8 +// CK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 +// CK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 +// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// CK11-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 +// CK11-NEXT: ret void +// +// +// CK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l61 +// CK11-SAME: (ptr noundef [[TR:%.*]]) #[[ATTR1]] { +// CK11-NEXT: entry: +// CK11-NEXT: [[TR_ADDR:%.*]] = alloca ptr, align 8 +// CK11-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// CK11-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 8 +// CK11-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 8 +// CK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 +// CK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 +// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK11-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 +// CK11-NEXT: ret void +// +// +// CK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l67 +// CK11-SAME: (ptr noundef [[TR:%.*]]) #[[ATTR1]] { +// CK11-NEXT: entry: +// CK11-NEXT: [[TR_ADDR:%.*]] = alloca ptr, align 8 +// CK11-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// CK11-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 8 +// CK11-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 8 +// CK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 +// CK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 +// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK11-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 +// CK11-NEXT: ret void +// +// +// CK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l74 +// CK11-SAME: (ptr noundef [[TR:%.*]], ptr noundef [[LR:%.*]]) #[[ATTR1]] { +// CK11-NEXT: entry: +// CK11-NEXT: [[TR_ADDR:%.*]] = alloca ptr, align 8 +// CK11-NEXT: [[LR_ADDR:%.*]] = alloca ptr, align 8 +// CK11-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// CK11-NEXT: [[_TMP1:%.*]] = alloca ptr, align 8 +// CK11-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 8 +// CK11-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 8 +// CK11-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 8 +// CK11-NEXT: store 
ptr [[LR_ADDR]], ptr [[_TMP1]], align 8 +// CK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 +// CK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 +// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK11-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 +// CK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP1]], align 8 +// CK11-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8 +// CK11-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 1 +// CK11-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP2]], align 8 +// CK11-NEXT: ret void +// +// +// CK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// CK11-SAME: () #[[ATTR3:[0-9]+]] { +// CK11-NEXT: entry: +// CK11-NEXT: call void @__tgt_register_requires(i64 1) +// CK11-NEXT: ret void +// +// +// CK12-LABEL: define {{[^@]+}}@_Z3barRPfRPi +// CK12-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) #[[ATTR0:[0-9]+]] { +// CK12-NEXT: entry: +// CK12-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4 +// CK12-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4 +// CK12-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4 +// CK12-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 4 +// CK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4 +// CK12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 4 +// CK12-NEXT: call void @_Z3fooIiEvRPfRPT_(ptr noundef nonnull align 4 dereferenceable(4) [[TMP0]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP1]]) +// CK12-NEXT: ret void +// +// +// CK12-LABEL: define {{[^@]+}}@_Z3fooIiEvRPfRPT_ +// CK12-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[LR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[TR:%.*]]) #[[ATTR0]] comdat { +// CK12-NEXT: entry: +// CK12-NEXT: [[LR_ADDR:%.*]] = alloca ptr, align 4 +// CK12-NEXT: [[TR_ADDR:%.*]] = alloca ptr, align 4 +// CK12-NEXT: [[L:%.*]] = alloca ptr, align 4 +// CK12-NEXT: [[T:%.*]] = alloca ptr, align 4 +// CK12-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4 +// CK12-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4 +// CK12-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4 +// CK12-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK12-NEXT: [[DOTOFFLOAD_BASEPTRS1:%.*]] = alloca [1 x ptr], align 4 +// CK12-NEXT: [[DOTOFFLOAD_PTRS2:%.*]] = alloca [1 x ptr], align 4 +// CK12-NEXT: [[DOTOFFLOAD_MAPPERS3:%.*]] = alloca [1 x ptr], align 4 +// CK12-NEXT: [[KERNEL_ARGS4:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK12-NEXT: [[DOTOFFLOAD_BASEPTRS7:%.*]] = alloca [1 x ptr], align 4 +// CK12-NEXT: [[DOTOFFLOAD_PTRS8:%.*]] = alloca [1 x ptr], align 4 +// CK12-NEXT: [[DOTOFFLOAD_MAPPERS9:%.*]] = alloca [1 x ptr], align 4 +// CK12-NEXT: [[KERNEL_ARGS10:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK12-NEXT: [[TMP:%.*]] = alloca ptr, align 4 +// CK12-NEXT: [[DOTOFFLOAD_BASEPTRS13:%.*]] = alloca [1 x ptr], align 4 +// CK12-NEXT: [[DOTOFFLOAD_PTRS14:%.*]] = alloca [1 x ptr], align 4 +// CK12-NEXT: [[DOTOFFLOAD_MAPPERS15:%.*]] = alloca [1 x ptr], align 4 +// CK12-NEXT: [[KERNEL_ARGS16:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK12-NEXT: [[_TMP19:%.*]] = alloca ptr, align 4 +// CK12-NEXT: [[DOTOFFLOAD_BASEPTRS20:%.*]] = alloca [1 x ptr], align 4 +// CK12-NEXT: [[DOTOFFLOAD_PTRS21:%.*]] = alloca [1 x ptr], align 4 +// CK12-NEXT: [[DOTOFFLOAD_MAPPERS22:%.*]] = alloca [1 x ptr], align 4 +// CK12-NEXT: 
[[KERNEL_ARGS23:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK12-NEXT: [[_TMP26:%.*]] = alloca ptr, align 4 +// CK12-NEXT: [[DOTOFFLOAD_BASEPTRS27:%.*]] = alloca [1 x ptr], align 4 +// CK12-NEXT: [[DOTOFFLOAD_PTRS28:%.*]] = alloca [1 x ptr], align 4 +// CK12-NEXT: [[DOTOFFLOAD_MAPPERS29:%.*]] = alloca [1 x ptr], align 4 +// CK12-NEXT: [[KERNEL_ARGS30:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK12-NEXT: [[_TMP33:%.*]] = alloca ptr, align 4 +// CK12-NEXT: [[_TMP34:%.*]] = alloca ptr, align 4 +// CK12-NEXT: [[DOTOFFLOAD_BASEPTRS35:%.*]] = alloca [2 x ptr], align 4 +// CK12-NEXT: [[DOTOFFLOAD_PTRS36:%.*]] = alloca [2 x ptr], align 4 +// CK12-NEXT: [[DOTOFFLOAD_MAPPERS37:%.*]] = alloca [2 x ptr], align 4 +// CK12-NEXT: [[KERNEL_ARGS38:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK12-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 4 +// CK12-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 4 +// CK12-NEXT: [[TMP0:%.*]] = load ptr, ptr @g, align 4 +// CK12-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK12-NEXT: store ptr [[TMP0]], ptr [[TMP1]], align 4 +// CK12-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK12-NEXT: store ptr [[TMP0]], ptr [[TMP2]], align 4 +// CK12-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CK12-NEXT: store ptr null, ptr [[TMP3]], align 4 +// CK12-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK12-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK12-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK12-NEXT: store i32 2, ptr [[TMP6]], align 4 +// CK12-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK12-NEXT: store i32 1, ptr [[TMP7]], align 4 +// CK12-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK12-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 4 +// CK12-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK12-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 4 +// CK12-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK12-NEXT: store ptr @.offload_sizes, ptr [[TMP10]], align 4 +// CK12-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK12-NEXT: store ptr @.offload_maptypes, ptr [[TMP11]], align 4 +// CK12-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK12-NEXT: store ptr null, ptr [[TMP12]], align 4 +// CK12-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK12-NEXT: store ptr null, ptr [[TMP13]], align 4 +// CK12-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK12-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK12-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK12-NEXT: store i64 0, ptr [[TMP15]], align 8 +// CK12-NEXT: [[TMP16:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK12-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP16]], align 4 +// CK12-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK12-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4 +// CK12-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK12-NEXT: store i32 0, ptr [[TMP18]], align 4 +// CK12-NEXT: [[TMP19:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l37.region_id, ptr [[KERNEL_ARGS]]) +// CK12-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CK12-NEXT: br i1 [[TMP20]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CK12: omp_offload.failed: +// CK12-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l37(ptr [[TMP0]]) #[[ATTR2:[0-9]+]] +// CK12-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CK12: omp_offload.cont: +// CK12-NEXT: [[TMP21:%.*]] = load ptr, ptr [[L]], align 4 +// CK12-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 +// CK12-NEXT: store ptr [[TMP21]], ptr [[TMP22]], align 4 +// CK12-NEXT: [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0 +// CK12-NEXT: store ptr [[TMP21]], ptr [[TMP23]], align 4 +// CK12-NEXT: [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS3]], i32 0, i32 0 +// CK12-NEXT: store ptr null, ptr [[TMP24]], align 4 +// CK12-NEXT: [[TMP25:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 +// CK12-NEXT: [[TMP26:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0 +// CK12-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0 +// CK12-NEXT: store i32 2, ptr [[TMP27]], align 4 +// CK12-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1 +// CK12-NEXT: store i32 1, ptr [[TMP28]], align 4 +// CK12-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2 +// CK12-NEXT: store ptr [[TMP25]], ptr [[TMP29]], align 4 +// CK12-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 3 +// CK12-NEXT: store ptr [[TMP26]], ptr [[TMP30]], align 4 +// CK12-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 4 +// CK12-NEXT: store ptr @.offload_sizes.1, ptr [[TMP31]], align 4 +// CK12-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 5 +// CK12-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP32]], align 4 +// CK12-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 6 +// CK12-NEXT: store ptr null, ptr [[TMP33]], align 4 +// CK12-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 7 +// CK12-NEXT: store ptr null, ptr [[TMP34]], align 4 +// CK12-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 8 +// CK12-NEXT: store i64 0, ptr [[TMP35]], align 8 +// CK12-NEXT: [[TMP36:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 9 +// CK12-NEXT: store i64 0, ptr [[TMP36]], align 8 +// CK12-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 10 +// CK12-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP37]], align 4 +// CK12-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 11 +// CK12-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP38]], align 4 +// CK12-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 12 +// CK12-NEXT: store i32 0, ptr [[TMP39]], align 4 +// CK12-NEXT: [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l43.region_id, ptr [[KERNEL_ARGS4]]) +// CK12-NEXT: [[TMP41:%.*]] = icmp ne i32 [[TMP40]], 0 +// CK12-NEXT: br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED5:%.*]], label [[OMP_OFFLOAD_CONT6:%.*]] +// CK12: omp_offload.failed5: +// CK12-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l43(ptr [[TMP21]]) #[[ATTR2]] +// CK12-NEXT: br label [[OMP_OFFLOAD_CONT6]] +// CK12: omp_offload.cont6: +// CK12-NEXT: [[TMP42:%.*]] = load ptr, ptr [[T]], align 4 +// CK12-NEXT: [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 0 +// CK12-NEXT: store ptr [[TMP42]], ptr [[TMP43]], align 4 +// CK12-NEXT: [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS8]], i32 0, i32 0 +// CK12-NEXT: store ptr [[TMP42]], ptr [[TMP44]], align 4 +// CK12-NEXT: [[TMP45:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS9]], i32 0, i32 0 +// CK12-NEXT: store ptr null, ptr [[TMP45]], align 4 +// CK12-NEXT: [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 0 +// CK12-NEXT: [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS8]], i32 0, i32 0 +// CK12-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 0 +// CK12-NEXT: store i32 2, ptr [[TMP48]], align 4 +// CK12-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 1 +// CK12-NEXT: store i32 1, ptr [[TMP49]], align 4 +// CK12-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 2 +// CK12-NEXT: store ptr [[TMP46]], ptr [[TMP50]], align 4 +// CK12-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 3 +// CK12-NEXT: store ptr [[TMP47]], ptr [[TMP51]], align 4 +// CK12-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 4 +// CK12-NEXT: store ptr @.offload_sizes.3, ptr [[TMP52]], align 4 +// CK12-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 5 +// CK12-NEXT: store ptr @.offload_maptypes.4, ptr [[TMP53]], align 4 +// CK12-NEXT: [[TMP54:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 6 +// CK12-NEXT: store ptr null, ptr [[TMP54]], align 4 +// CK12-NEXT: [[TMP55:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 7 +// CK12-NEXT: store ptr null, ptr [[TMP55]], align 4 +// CK12-NEXT: [[TMP56:%.*]] = getelementptr 
inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 8 +// CK12-NEXT: store i64 0, ptr [[TMP56]], align 8 +// CK12-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 9 +// CK12-NEXT: store i64 0, ptr [[TMP57]], align 8 +// CK12-NEXT: [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 10 +// CK12-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP58]], align 4 +// CK12-NEXT: [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 11 +// CK12-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP59]], align 4 +// CK12-NEXT: [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 12 +// CK12-NEXT: store i32 0, ptr [[TMP60]], align 4 +// CK12-NEXT: [[TMP61:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l49.region_id, ptr [[KERNEL_ARGS10]]) +// CK12-NEXT: [[TMP62:%.*]] = icmp ne i32 [[TMP61]], 0 +// CK12-NEXT: br i1 [[TMP62]], label [[OMP_OFFLOAD_FAILED11:%.*]], label [[OMP_OFFLOAD_CONT12:%.*]] +// CK12: omp_offload.failed11: +// CK12-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l49(ptr [[TMP42]]) #[[ATTR2]] +// CK12-NEXT: br label [[OMP_OFFLOAD_CONT12]] +// CK12: omp_offload.cont12: +// CK12-NEXT: [[TMP63:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 +// CK12-NEXT: store ptr [[TMP63]], ptr [[TMP]], align 4 +// CK12-NEXT: [[TMP64:%.*]] = load ptr, ptr [[TMP]], align 4 +// CK12-NEXT: [[TMP65:%.*]] = load ptr, ptr [[TMP64]], align 4 +// CK12-NEXT: [[TMP66:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS13]], i32 0, i32 0 +// CK12-NEXT: store ptr [[TMP65]], ptr [[TMP66]], align 4 +// CK12-NEXT: [[TMP67:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS14]], i32 0, i32 0 +// CK12-NEXT: store ptr [[TMP65]], ptr [[TMP67]], align 4 +// CK12-NEXT: [[TMP68:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS15]], i32 0, i32 0 +// CK12-NEXT: store ptr null, ptr [[TMP68]], align 4 +// CK12-NEXT: [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS13]], i32 0, i32 0 +// CK12-NEXT: [[TMP70:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS14]], i32 0, i32 0 +// CK12-NEXT: [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 0 +// CK12-NEXT: store i32 2, ptr [[TMP71]], align 4 +// CK12-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 1 +// CK12-NEXT: store i32 1, ptr [[TMP72]], align 4 +// CK12-NEXT: [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 2 +// CK12-NEXT: store ptr [[TMP69]], ptr [[TMP73]], align 4 +// CK12-NEXT: [[TMP74:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 3 +// CK12-NEXT: store ptr [[TMP70]], ptr [[TMP74]], align 4 +// CK12-NEXT: [[TMP75:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 4 +// CK12-NEXT: store ptr @.offload_sizes.5, ptr [[TMP75]], align 4 +// CK12-NEXT: [[TMP76:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 5 +// CK12-NEXT: store ptr @.offload_maptypes.6, ptr [[TMP76]], align 4 +// CK12-NEXT: 
[[TMP77:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 6 +// CK12-NEXT: store ptr null, ptr [[TMP77]], align 4 +// CK12-NEXT: [[TMP78:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 7 +// CK12-NEXT: store ptr null, ptr [[TMP78]], align 4 +// CK12-NEXT: [[TMP79:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 8 +// CK12-NEXT: store i64 0, ptr [[TMP79]], align 8 +// CK12-NEXT: [[TMP80:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 9 +// CK12-NEXT: store i64 0, ptr [[TMP80]], align 8 +// CK12-NEXT: [[TMP81:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 10 +// CK12-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP81]], align 4 +// CK12-NEXT: [[TMP82:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 11 +// CK12-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP82]], align 4 +// CK12-NEXT: [[TMP83:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 12 +// CK12-NEXT: store i32 0, ptr [[TMP83]], align 4 +// CK12-NEXT: [[TMP84:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l55.region_id, ptr [[KERNEL_ARGS16]]) +// CK12-NEXT: [[TMP85:%.*]] = icmp ne i32 [[TMP84]], 0 +// CK12-NEXT: br i1 [[TMP85]], label [[OMP_OFFLOAD_FAILED17:%.*]], label [[OMP_OFFLOAD_CONT18:%.*]] +// CK12: omp_offload.failed17: +// CK12-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l55(ptr [[TMP65]]) #[[ATTR2]] +// CK12-NEXT: br label [[OMP_OFFLOAD_CONT18]] +// CK12: omp_offload.cont18: +// CK12-NEXT: [[TMP86:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 +// CK12-NEXT: store ptr [[TMP86]], ptr [[_TMP19]], align 4 +// CK12-NEXT: [[TMP87:%.*]] = load ptr, ptr [[_TMP19]], align 4 +// CK12-NEXT: [[TMP88:%.*]] = load ptr, ptr [[TMP87]], align 4 +// CK12-NEXT: [[TMP89:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS20]], i32 0, i32 0 +// CK12-NEXT: store ptr [[TMP88]], ptr [[TMP89]], align 4 +// CK12-NEXT: [[TMP90:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS21]], i32 0, i32 0 +// CK12-NEXT: store ptr [[TMP88]], ptr [[TMP90]], align 4 +// CK12-NEXT: [[TMP91:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS22]], i32 0, i32 0 +// CK12-NEXT: store ptr null, ptr [[TMP91]], align 4 +// CK12-NEXT: [[TMP92:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS20]], i32 0, i32 0 +// CK12-NEXT: [[TMP93:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS21]], i32 0, i32 0 +// CK12-NEXT: [[TMP94:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0 +// CK12-NEXT: store i32 2, ptr [[TMP94]], align 4 +// CK12-NEXT: [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1 +// CK12-NEXT: store i32 1, ptr [[TMP95]], align 4 +// CK12-NEXT: [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2 +// CK12-NEXT: store ptr [[TMP92]], ptr [[TMP96]], align 4 +// CK12-NEXT: [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 3 +// CK12-NEXT: store ptr [[TMP93]], ptr [[TMP97]], align 4 +// CK12-NEXT: 
[[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 4 +// CK12-NEXT: store ptr @.offload_sizes.7, ptr [[TMP98]], align 4 +// CK12-NEXT: [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 5 +// CK12-NEXT: store ptr @.offload_maptypes.8, ptr [[TMP99]], align 4 +// CK12-NEXT: [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 6 +// CK12-NEXT: store ptr null, ptr [[TMP100]], align 4 +// CK12-NEXT: [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 7 +// CK12-NEXT: store ptr null, ptr [[TMP101]], align 4 +// CK12-NEXT: [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 8 +// CK12-NEXT: store i64 0, ptr [[TMP102]], align 8 +// CK12-NEXT: [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 9 +// CK12-NEXT: store i64 0, ptr [[TMP103]], align 8 +// CK12-NEXT: [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 10 +// CK12-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP104]], align 4 +// CK12-NEXT: [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 11 +// CK12-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP105]], align 4 +// CK12-NEXT: [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 12 +// CK12-NEXT: store i32 0, ptr [[TMP106]], align 4 +// CK12-NEXT: [[TMP107:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l61.region_id, ptr [[KERNEL_ARGS23]]) +// CK12-NEXT: [[TMP108:%.*]] = icmp ne i32 [[TMP107]], 0 +// CK12-NEXT: br i1 [[TMP108]], label [[OMP_OFFLOAD_FAILED24:%.*]], label [[OMP_OFFLOAD_CONT25:%.*]] +// CK12: omp_offload.failed24: +// CK12-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l61(ptr [[TMP88]]) #[[ATTR2]] +// CK12-NEXT: br label [[OMP_OFFLOAD_CONT25]] +// CK12: omp_offload.cont25: +// CK12-NEXT: [[TMP109:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 +// CK12-NEXT: store ptr [[TMP109]], ptr [[_TMP26]], align 4 +// CK12-NEXT: [[TMP110:%.*]] = load ptr, ptr [[_TMP26]], align 4 +// CK12-NEXT: [[TMP111:%.*]] = load ptr, ptr [[TMP110]], align 4 +// CK12-NEXT: [[TMP112:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0 +// CK12-NEXT: store ptr [[TMP111]], ptr [[TMP112]], align 4 +// CK12-NEXT: [[TMP113:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0 +// CK12-NEXT: store ptr [[TMP111]], ptr [[TMP113]], align 4 +// CK12-NEXT: [[TMP114:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS29]], i32 0, i32 0 +// CK12-NEXT: store ptr null, ptr [[TMP114]], align 4 +// CK12-NEXT: [[TMP115:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0 +// CK12-NEXT: [[TMP116:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0 +// CK12-NEXT: [[TMP117:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0 +// CK12-NEXT: store i32 2, ptr [[TMP117]], align 4 +// CK12-NEXT: [[TMP118:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1 +// 
CK12-NEXT: store i32 1, ptr [[TMP118]], align 4 +// CK12-NEXT: [[TMP119:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2 +// CK12-NEXT: store ptr [[TMP115]], ptr [[TMP119]], align 4 +// CK12-NEXT: [[TMP120:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 3 +// CK12-NEXT: store ptr [[TMP116]], ptr [[TMP120]], align 4 +// CK12-NEXT: [[TMP121:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 4 +// CK12-NEXT: store ptr @.offload_sizes.9, ptr [[TMP121]], align 4 +// CK12-NEXT: [[TMP122:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 5 +// CK12-NEXT: store ptr @.offload_maptypes.10, ptr [[TMP122]], align 4 +// CK12-NEXT: [[TMP123:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 6 +// CK12-NEXT: store ptr null, ptr [[TMP123]], align 4 +// CK12-NEXT: [[TMP124:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 7 +// CK12-NEXT: store ptr null, ptr [[TMP124]], align 4 +// CK12-NEXT: [[TMP125:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 8 +// CK12-NEXT: store i64 0, ptr [[TMP125]], align 8 +// CK12-NEXT: [[TMP126:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 9 +// CK12-NEXT: store i64 0, ptr [[TMP126]], align 8 +// CK12-NEXT: [[TMP127:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 10 +// CK12-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP127]], align 4 +// CK12-NEXT: [[TMP128:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 11 +// CK12-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP128]], align 4 +// CK12-NEXT: [[TMP129:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 12 +// CK12-NEXT: store i32 0, ptr [[TMP129]], align 4 +// CK12-NEXT: [[TMP130:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l67.region_id, ptr [[KERNEL_ARGS30]]) +// CK12-NEXT: [[TMP131:%.*]] = icmp ne i32 [[TMP130]], 0 +// CK12-NEXT: br i1 [[TMP131]], label [[OMP_OFFLOAD_FAILED31:%.*]], label [[OMP_OFFLOAD_CONT32:%.*]] +// CK12: omp_offload.failed31: +// CK12-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l67(ptr [[TMP111]]) #[[ATTR2]] +// CK12-NEXT: br label [[OMP_OFFLOAD_CONT32]] +// CK12: omp_offload.cont32: +// CK12-NEXT: [[TMP132:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 +// CK12-NEXT: store ptr [[TMP132]], ptr [[_TMP33]], align 4 +// CK12-NEXT: [[TMP133:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 +// CK12-NEXT: store ptr [[TMP133]], ptr [[_TMP34]], align 4 +// CK12-NEXT: [[TMP134:%.*]] = load ptr, ptr [[_TMP33]], align 4 +// CK12-NEXT: [[TMP135:%.*]] = load ptr, ptr [[TMP134]], align 4 +// CK12-NEXT: [[TMP136:%.*]] = load ptr, ptr [[_TMP34]], align 4 +// CK12-NEXT: [[TMP137:%.*]] = load ptr, ptr [[TMP136]], align 4 +// CK12-NEXT: [[TMP138:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 0 +// CK12-NEXT: store ptr [[TMP135]], ptr [[TMP138]], align 4 +// CK12-NEXT: [[TMP139:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 0 +// CK12-NEXT: store ptr [[TMP135]], ptr [[TMP139]], 
align 4 +// CK12-NEXT: [[TMP140:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS37]], i32 0, i32 0 +// CK12-NEXT: store ptr null, ptr [[TMP140]], align 4 +// CK12-NEXT: [[TMP141:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 1 +// CK12-NEXT: store ptr [[TMP137]], ptr [[TMP141]], align 4 +// CK12-NEXT: [[TMP142:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 1 +// CK12-NEXT: store ptr [[TMP137]], ptr [[TMP142]], align 4 +// CK12-NEXT: [[TMP143:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS37]], i32 0, i32 1 +// CK12-NEXT: store ptr null, ptr [[TMP143]], align 4 +// CK12-NEXT: [[TMP144:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 0 +// CK12-NEXT: [[TMP145:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 0 +// CK12-NEXT: [[TMP146:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 0 +// CK12-NEXT: store i32 2, ptr [[TMP146]], align 4 +// CK12-NEXT: [[TMP147:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 1 +// CK12-NEXT: store i32 2, ptr [[TMP147]], align 4 +// CK12-NEXT: [[TMP148:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 2 +// CK12-NEXT: store ptr [[TMP144]], ptr [[TMP148]], align 4 +// CK12-NEXT: [[TMP149:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 3 +// CK12-NEXT: store ptr [[TMP145]], ptr [[TMP149]], align 4 +// CK12-NEXT: [[TMP150:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 4 +// CK12-NEXT: store ptr @.offload_sizes.11, ptr [[TMP150]], align 4 +// CK12-NEXT: [[TMP151:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 5 +// CK12-NEXT: store ptr @.offload_maptypes.12, ptr [[TMP151]], align 4 +// CK12-NEXT: [[TMP152:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 6 +// CK12-NEXT: store ptr null, ptr [[TMP152]], align 4 +// CK12-NEXT: [[TMP153:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 7 +// CK12-NEXT: store ptr null, ptr [[TMP153]], align 4 +// CK12-NEXT: [[TMP154:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 8 +// CK12-NEXT: store i64 0, ptr [[TMP154]], align 8 +// CK12-NEXT: [[TMP155:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 9 +// CK12-NEXT: store i64 0, ptr [[TMP155]], align 8 +// CK12-NEXT: [[TMP156:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 10 +// CK12-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP156]], align 4 +// CK12-NEXT: [[TMP157:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 11 +// CK12-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP157]], align 4 +// CK12-NEXT: [[TMP158:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 12 +// CK12-NEXT: store i32 0, ptr [[TMP158]], align 4 +// CK12-NEXT: [[TMP159:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l74.region_id, ptr [[KERNEL_ARGS38]]) +// CK12-NEXT: [[TMP160:%.*]] = 
icmp ne i32 [[TMP159]], 0 +// CK12-NEXT: br i1 [[TMP160]], label [[OMP_OFFLOAD_FAILED39:%.*]], label [[OMP_OFFLOAD_CONT40:%.*]] +// CK12: omp_offload.failed39: +// CK12-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l74(ptr [[TMP135]], ptr [[TMP137]]) #[[ATTR2]] +// CK12-NEXT: br label [[OMP_OFFLOAD_CONT40]] +// CK12: omp_offload.cont40: +// CK12-NEXT: ret void +// +// +// CK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l37 +// CK12-SAME: (ptr noundef [[G:%.*]]) #[[ATTR1:[0-9]+]] { +// CK12-NEXT: entry: +// CK12-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 4 +// CK12-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 4 +// CK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[G_ADDR]], align 4 +// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK12-NEXT: store ptr [[INCDEC_PTR]], ptr [[G_ADDR]], align 4 +// CK12-NEXT: ret void +// +// +// CK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l43 +// CK12-SAME: (ptr noundef [[L:%.*]]) #[[ATTR1]] { +// CK12-NEXT: entry: +// CK12-NEXT: [[L_ADDR:%.*]] = alloca ptr, align 4 +// CK12-NEXT: store ptr [[L]], ptr [[L_ADDR]], align 4 +// CK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[L_ADDR]], align 4 +// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 1 +// CK12-NEXT: store ptr [[INCDEC_PTR]], ptr [[L_ADDR]], align 4 +// CK12-NEXT: ret void +// +// +// CK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l49 +// CK12-SAME: (ptr noundef [[T:%.*]]) #[[ATTR1]] { +// CK12-NEXT: entry: +// CK12-NEXT: [[T_ADDR:%.*]] = alloca ptr, align 4 +// CK12-NEXT: store ptr [[T]], ptr [[T_ADDR]], align 4 +// CK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_ADDR]], align 4 +// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 1 +// CK12-NEXT: store ptr [[INCDEC_PTR]], ptr [[T_ADDR]], align 4 +// CK12-NEXT: ret void +// +// +// CK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l55 +// CK12-SAME: (ptr noundef [[LR:%.*]]) #[[ATTR1]] { +// CK12-NEXT: entry: +// CK12-NEXT: [[LR_ADDR:%.*]] = alloca ptr, align 4 +// CK12-NEXT: [[TMP:%.*]] = alloca ptr, align 4 +// CK12-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 4 +// CK12-NEXT: store ptr [[LR_ADDR]], ptr [[TMP]], align 4 +// CK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 +// CK12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 +// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// CK12-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 +// CK12-NEXT: ret void +// +// +// CK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l61 +// CK12-SAME: (ptr noundef [[TR:%.*]]) #[[ATTR1]] { +// CK12-NEXT: entry: +// CK12-NEXT: [[TR_ADDR:%.*]] = alloca ptr, align 4 +// CK12-NEXT: [[TMP:%.*]] = alloca ptr, align 4 +// CK12-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 4 +// CK12-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 4 +// CK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 +// CK12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 +// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK12-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 +// CK12-NEXT: ret void +// +// +// CK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l67 +// CK12-SAME: (ptr noundef [[TR:%.*]]) #[[ATTR1]] { +// CK12-NEXT: entry: +// CK12-NEXT: 
[[TR_ADDR:%.*]] = alloca ptr, align 4 +// CK12-NEXT: [[TMP:%.*]] = alloca ptr, align 4 +// CK12-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 4 +// CK12-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 4 +// CK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 +// CK12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 +// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK12-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 +// CK12-NEXT: ret void +// +// +// CK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l74 +// CK12-SAME: (ptr noundef [[TR:%.*]], ptr noundef [[LR:%.*]]) #[[ATTR1]] { +// CK12-NEXT: entry: +// CK12-NEXT: [[TR_ADDR:%.*]] = alloca ptr, align 4 +// CK12-NEXT: [[LR_ADDR:%.*]] = alloca ptr, align 4 +// CK12-NEXT: [[TMP:%.*]] = alloca ptr, align 4 +// CK12-NEXT: [[_TMP1:%.*]] = alloca ptr, align 4 +// CK12-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 4 +// CK12-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 4 +// CK12-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 4 +// CK12-NEXT: store ptr [[LR_ADDR]], ptr [[_TMP1]], align 4 +// CK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 +// CK12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 +// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK12-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 +// CK12-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP1]], align 4 +// CK12-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 4 +// CK12-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 1 +// CK12-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP2]], align 4 +// CK12-NEXT: ret void +// +// +// CK12-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// CK12-SAME: () #[[ATTR3:[0-9]+]] { +// CK12-NEXT: entry: +// CK12-NEXT: call void @__tgt_register_requires(i64 1) +// CK12-NEXT: ret void +// +// +// CK13-LABEL: define {{[^@]+}}@_Z3barRPfRPi +// CK13-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) #[[ATTR0:[0-9]+]] { +// CK13-NEXT: entry: +// CK13-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4 +// CK13-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4 +// CK13-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4 +// CK13-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 4 +// CK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4 +// CK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 4 +// CK13-NEXT: call void @_Z3fooIiEvRPfRPT_(ptr noundef nonnull align 4 dereferenceable(4) [[TMP0]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP1]]) +// CK13-NEXT: ret void +// +// +// CK13-LABEL: define {{[^@]+}}@_Z3fooIiEvRPfRPT_ +// CK13-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[LR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[TR:%.*]]) #[[ATTR0]] comdat { +// CK13-NEXT: entry: +// CK13-NEXT: [[LR_ADDR:%.*]] = alloca ptr, align 4 +// CK13-NEXT: [[TR_ADDR:%.*]] = alloca ptr, align 4 +// CK13-NEXT: [[L:%.*]] = alloca ptr, align 4 +// CK13-NEXT: [[T:%.*]] = alloca ptr, align 4 +// CK13-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4 +// CK13-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4 +// CK13-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4 +// CK13-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK13-NEXT: [[DOTOFFLOAD_BASEPTRS1:%.*]] = alloca [1 x ptr], align 4 +// CK13-NEXT: [[DOTOFFLOAD_PTRS2:%.*]] = alloca [1 x ptr], align 4 
+// CK13-NEXT: [[DOTOFFLOAD_MAPPERS3:%.*]] = alloca [1 x ptr], align 4 +// CK13-NEXT: [[KERNEL_ARGS4:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK13-NEXT: [[DOTOFFLOAD_BASEPTRS7:%.*]] = alloca [1 x ptr], align 4 +// CK13-NEXT: [[DOTOFFLOAD_PTRS8:%.*]] = alloca [1 x ptr], align 4 +// CK13-NEXT: [[DOTOFFLOAD_MAPPERS9:%.*]] = alloca [1 x ptr], align 4 +// CK13-NEXT: [[KERNEL_ARGS10:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK13-NEXT: [[TMP:%.*]] = alloca ptr, align 4 +// CK13-NEXT: [[DOTOFFLOAD_BASEPTRS13:%.*]] = alloca [1 x ptr], align 4 +// CK13-NEXT: [[DOTOFFLOAD_PTRS14:%.*]] = alloca [1 x ptr], align 4 +// CK13-NEXT: [[DOTOFFLOAD_MAPPERS15:%.*]] = alloca [1 x ptr], align 4 +// CK13-NEXT: [[KERNEL_ARGS16:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK13-NEXT: [[_TMP19:%.*]] = alloca ptr, align 4 +// CK13-NEXT: [[DOTOFFLOAD_BASEPTRS20:%.*]] = alloca [1 x ptr], align 4 +// CK13-NEXT: [[DOTOFFLOAD_PTRS21:%.*]] = alloca [1 x ptr], align 4 +// CK13-NEXT: [[DOTOFFLOAD_MAPPERS22:%.*]] = alloca [1 x ptr], align 4 +// CK13-NEXT: [[KERNEL_ARGS23:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK13-NEXT: [[_TMP26:%.*]] = alloca ptr, align 4 +// CK13-NEXT: [[DOTOFFLOAD_BASEPTRS27:%.*]] = alloca [1 x ptr], align 4 +// CK13-NEXT: [[DOTOFFLOAD_PTRS28:%.*]] = alloca [1 x ptr], align 4 +// CK13-NEXT: [[DOTOFFLOAD_MAPPERS29:%.*]] = alloca [1 x ptr], align 4 +// CK13-NEXT: [[KERNEL_ARGS30:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK13-NEXT: [[_TMP33:%.*]] = alloca ptr, align 4 +// CK13-NEXT: [[_TMP34:%.*]] = alloca ptr, align 4 +// CK13-NEXT: [[DOTOFFLOAD_BASEPTRS35:%.*]] = alloca [2 x ptr], align 4 +// CK13-NEXT: [[DOTOFFLOAD_PTRS36:%.*]] = alloca [2 x ptr], align 4 +// CK13-NEXT: [[DOTOFFLOAD_MAPPERS37:%.*]] = alloca [2 x ptr], align 4 +// CK13-NEXT: [[KERNEL_ARGS38:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK13-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 4 +// CK13-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 4 +// CK13-NEXT: [[TMP0:%.*]] = load ptr, ptr @g, align 4 +// CK13-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK13-NEXT: store ptr [[TMP0]], ptr [[TMP1]], align 4 +// CK13-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK13-NEXT: store ptr [[TMP0]], ptr [[TMP2]], align 4 +// CK13-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CK13-NEXT: store ptr null, ptr [[TMP3]], align 4 +// CK13-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK13-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK13-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK13-NEXT: store i32 2, ptr [[TMP6]], align 4 +// CK13-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK13-NEXT: store i32 1, ptr [[TMP7]], align 4 +// CK13-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK13-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 4 +// CK13-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK13-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 4 +// CK13-NEXT: [[TMP10:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK13-NEXT: store ptr @.offload_sizes, ptr [[TMP10]], align 4 +// CK13-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK13-NEXT: store ptr @.offload_maptypes, ptr [[TMP11]], align 4 +// CK13-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK13-NEXT: store ptr null, ptr [[TMP12]], align 4 +// CK13-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK13-NEXT: store ptr null, ptr [[TMP13]], align 4 +// CK13-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK13-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK13-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK13-NEXT: store i64 0, ptr [[TMP15]], align 8 +// CK13-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK13-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP16]], align 4 +// CK13-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK13-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4 +// CK13-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK13-NEXT: store i32 0, ptr [[TMP18]], align 4 +// CK13-NEXT: [[TMP19:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l37.region_id, ptr [[KERNEL_ARGS]]) +// CK13-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CK13-NEXT: br i1 [[TMP20]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CK13: omp_offload.failed: +// CK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l37(ptr [[TMP0]]) #[[ATTR2:[0-9]+]] +// CK13-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CK13: omp_offload.cont: +// CK13-NEXT: [[TMP21:%.*]] = load ptr, ptr [[L]], align 4 +// CK13-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 +// CK13-NEXT: store ptr [[TMP21]], ptr [[TMP22]], align 4 +// CK13-NEXT: [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0 +// CK13-NEXT: store ptr [[TMP21]], ptr [[TMP23]], align 4 +// CK13-NEXT: [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS3]], i32 0, i32 0 +// CK13-NEXT: store ptr null, ptr [[TMP24]], align 4 +// CK13-NEXT: [[TMP25:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 +// CK13-NEXT: [[TMP26:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0 +// CK13-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0 +// CK13-NEXT: store i32 2, ptr [[TMP27]], align 4 +// CK13-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1 +// CK13-NEXT: store i32 1, ptr [[TMP28]], align 4 +// CK13-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2 +// CK13-NEXT: store ptr [[TMP25]], ptr [[TMP29]], align 4 +// CK13-NEXT: [[TMP30:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 3 +// CK13-NEXT: store ptr [[TMP26]], ptr [[TMP30]], align 4 +// CK13-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 4 +// CK13-NEXT: store ptr @.offload_sizes.1, ptr [[TMP31]], align 4 +// CK13-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 5 +// CK13-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP32]], align 4 +// CK13-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 6 +// CK13-NEXT: store ptr null, ptr [[TMP33]], align 4 +// CK13-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 7 +// CK13-NEXT: store ptr null, ptr [[TMP34]], align 4 +// CK13-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 8 +// CK13-NEXT: store i64 0, ptr [[TMP35]], align 8 +// CK13-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 9 +// CK13-NEXT: store i64 0, ptr [[TMP36]], align 8 +// CK13-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 10 +// CK13-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP37]], align 4 +// CK13-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 11 +// CK13-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP38]], align 4 +// CK13-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 12 +// CK13-NEXT: store i32 0, ptr [[TMP39]], align 4 +// CK13-NEXT: [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l43.region_id, ptr [[KERNEL_ARGS4]]) +// CK13-NEXT: [[TMP41:%.*]] = icmp ne i32 [[TMP40]], 0 +// CK13-NEXT: br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED5:%.*]], label [[OMP_OFFLOAD_CONT6:%.*]] +// CK13: omp_offload.failed5: +// CK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l43(ptr [[TMP21]]) #[[ATTR2]] +// CK13-NEXT: br label [[OMP_OFFLOAD_CONT6]] +// CK13: omp_offload.cont6: +// CK13-NEXT: [[TMP42:%.*]] = load ptr, ptr [[T]], align 4 +// CK13-NEXT: [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 0 +// CK13-NEXT: store ptr [[TMP42]], ptr [[TMP43]], align 4 +// CK13-NEXT: [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS8]], i32 0, i32 0 +// CK13-NEXT: store ptr [[TMP42]], ptr [[TMP44]], align 4 +// CK13-NEXT: [[TMP45:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS9]], i32 0, i32 0 +// CK13-NEXT: store ptr null, ptr [[TMP45]], align 4 +// CK13-NEXT: [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 0 +// CK13-NEXT: [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS8]], i32 0, i32 0 +// CK13-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 0 +// CK13-NEXT: store i32 2, ptr [[TMP48]], align 4 +// CK13-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 1 +// CK13-NEXT: store i32 1, ptr [[TMP49]], align 4 +// CK13-NEXT: [[TMP50:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 2 +// CK13-NEXT: store ptr [[TMP46]], ptr [[TMP50]], align 4 +// CK13-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 3 +// CK13-NEXT: store ptr [[TMP47]], ptr [[TMP51]], align 4 +// CK13-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 4 +// CK13-NEXT: store ptr @.offload_sizes.3, ptr [[TMP52]], align 4 +// CK13-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 5 +// CK13-NEXT: store ptr @.offload_maptypes.4, ptr [[TMP53]], align 4 +// CK13-NEXT: [[TMP54:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 6 +// CK13-NEXT: store ptr null, ptr [[TMP54]], align 4 +// CK13-NEXT: [[TMP55:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 7 +// CK13-NEXT: store ptr null, ptr [[TMP55]], align 4 +// CK13-NEXT: [[TMP56:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 8 +// CK13-NEXT: store i64 0, ptr [[TMP56]], align 8 +// CK13-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 9 +// CK13-NEXT: store i64 0, ptr [[TMP57]], align 8 +// CK13-NEXT: [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 10 +// CK13-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP58]], align 4 +// CK13-NEXT: [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 11 +// CK13-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP59]], align 4 +// CK13-NEXT: [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 12 +// CK13-NEXT: store i32 0, ptr [[TMP60]], align 4 +// CK13-NEXT: [[TMP61:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l49.region_id, ptr [[KERNEL_ARGS10]]) +// CK13-NEXT: [[TMP62:%.*]] = icmp ne i32 [[TMP61]], 0 +// CK13-NEXT: br i1 [[TMP62]], label [[OMP_OFFLOAD_FAILED11:%.*]], label [[OMP_OFFLOAD_CONT12:%.*]] +// CK13: omp_offload.failed11: +// CK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l49(ptr [[TMP42]]) #[[ATTR2]] +// CK13-NEXT: br label [[OMP_OFFLOAD_CONT12]] +// CK13: omp_offload.cont12: +// CK13-NEXT: [[TMP63:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 +// CK13-NEXT: store ptr [[TMP63]], ptr [[TMP]], align 4 +// CK13-NEXT: [[TMP64:%.*]] = load ptr, ptr [[TMP]], align 4 +// CK13-NEXT: [[TMP65:%.*]] = load ptr, ptr [[TMP64]], align 4 +// CK13-NEXT: [[TMP66:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS13]], i32 0, i32 0 +// CK13-NEXT: store ptr [[TMP65]], ptr [[TMP66]], align 4 +// CK13-NEXT: [[TMP67:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS14]], i32 0, i32 0 +// CK13-NEXT: store ptr [[TMP65]], ptr [[TMP67]], align 4 +// CK13-NEXT: [[TMP68:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS15]], i32 0, i32 0 +// CK13-NEXT: store ptr null, ptr [[TMP68]], align 4 +// CK13-NEXT: [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS13]], i32 0, i32 0 +// CK13-NEXT: [[TMP70:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS14]], i32 0, i32 0 +// CK13-NEXT: 
[[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 0 +// CK13-NEXT: store i32 2, ptr [[TMP71]], align 4 +// CK13-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 1 +// CK13-NEXT: store i32 1, ptr [[TMP72]], align 4 +// CK13-NEXT: [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 2 +// CK13-NEXT: store ptr [[TMP69]], ptr [[TMP73]], align 4 +// CK13-NEXT: [[TMP74:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 3 +// CK13-NEXT: store ptr [[TMP70]], ptr [[TMP74]], align 4 +// CK13-NEXT: [[TMP75:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 4 +// CK13-NEXT: store ptr @.offload_sizes.5, ptr [[TMP75]], align 4 +// CK13-NEXT: [[TMP76:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 5 +// CK13-NEXT: store ptr @.offload_maptypes.6, ptr [[TMP76]], align 4 +// CK13-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 6 +// CK13-NEXT: store ptr null, ptr [[TMP77]], align 4 +// CK13-NEXT: [[TMP78:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 7 +// CK13-NEXT: store ptr null, ptr [[TMP78]], align 4 +// CK13-NEXT: [[TMP79:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 8 +// CK13-NEXT: store i64 0, ptr [[TMP79]], align 8 +// CK13-NEXT: [[TMP80:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 9 +// CK13-NEXT: store i64 0, ptr [[TMP80]], align 8 +// CK13-NEXT: [[TMP81:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 10 +// CK13-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP81]], align 4 +// CK13-NEXT: [[TMP82:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 11 +// CK13-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP82]], align 4 +// CK13-NEXT: [[TMP83:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 12 +// CK13-NEXT: store i32 0, ptr [[TMP83]], align 4 +// CK13-NEXT: [[TMP84:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l55.region_id, ptr [[KERNEL_ARGS16]]) +// CK13-NEXT: [[TMP85:%.*]] = icmp ne i32 [[TMP84]], 0 +// CK13-NEXT: br i1 [[TMP85]], label [[OMP_OFFLOAD_FAILED17:%.*]], label [[OMP_OFFLOAD_CONT18:%.*]] +// CK13: omp_offload.failed17: +// CK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l55(ptr [[TMP65]]) #[[ATTR2]] +// CK13-NEXT: br label [[OMP_OFFLOAD_CONT18]] +// CK13: omp_offload.cont18: +// CK13-NEXT: [[TMP86:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 +// CK13-NEXT: store ptr [[TMP86]], ptr [[_TMP19]], align 4 +// CK13-NEXT: [[TMP87:%.*]] = load ptr, ptr [[_TMP19]], align 4 +// CK13-NEXT: [[TMP88:%.*]] = load ptr, ptr [[TMP87]], align 4 +// CK13-NEXT: [[TMP89:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS20]], i32 0, i32 0 +// CK13-NEXT: store ptr [[TMP88]], ptr [[TMP89]], align 4 +// CK13-NEXT: [[TMP90:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS21]], i32 0, i32 0 +// CK13-NEXT: store ptr [[TMP88]], ptr [[TMP90]], align 4 +// 
CK13-NEXT: [[TMP91:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS22]], i32 0, i32 0 +// CK13-NEXT: store ptr null, ptr [[TMP91]], align 4 +// CK13-NEXT: [[TMP92:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS20]], i32 0, i32 0 +// CK13-NEXT: [[TMP93:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS21]], i32 0, i32 0 +// CK13-NEXT: [[TMP94:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0 +// CK13-NEXT: store i32 2, ptr [[TMP94]], align 4 +// CK13-NEXT: [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1 +// CK13-NEXT: store i32 1, ptr [[TMP95]], align 4 +// CK13-NEXT: [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2 +// CK13-NEXT: store ptr [[TMP92]], ptr [[TMP96]], align 4 +// CK13-NEXT: [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 3 +// CK13-NEXT: store ptr [[TMP93]], ptr [[TMP97]], align 4 +// CK13-NEXT: [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 4 +// CK13-NEXT: store ptr @.offload_sizes.7, ptr [[TMP98]], align 4 +// CK13-NEXT: [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 5 +// CK13-NEXT: store ptr @.offload_maptypes.8, ptr [[TMP99]], align 4 +// CK13-NEXT: [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 6 +// CK13-NEXT: store ptr null, ptr [[TMP100]], align 4 +// CK13-NEXT: [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 7 +// CK13-NEXT: store ptr null, ptr [[TMP101]], align 4 +// CK13-NEXT: [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 8 +// CK13-NEXT: store i64 0, ptr [[TMP102]], align 8 +// CK13-NEXT: [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 9 +// CK13-NEXT: store i64 0, ptr [[TMP103]], align 8 +// CK13-NEXT: [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 10 +// CK13-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP104]], align 4 +// CK13-NEXT: [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 11 +// CK13-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP105]], align 4 +// CK13-NEXT: [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 12 +// CK13-NEXT: store i32 0, ptr [[TMP106]], align 4 +// CK13-NEXT: [[TMP107:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l61.region_id, ptr [[KERNEL_ARGS23]]) +// CK13-NEXT: [[TMP108:%.*]] = icmp ne i32 [[TMP107]], 0 +// CK13-NEXT: br i1 [[TMP108]], label [[OMP_OFFLOAD_FAILED24:%.*]], label [[OMP_OFFLOAD_CONT25:%.*]] +// CK13: omp_offload.failed24: +// CK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l61(ptr [[TMP88]]) #[[ATTR2]] +// CK13-NEXT: br label [[OMP_OFFLOAD_CONT25]] +// CK13: omp_offload.cont25: +// CK13-NEXT: [[TMP109:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 +// CK13-NEXT: store ptr [[TMP109]], ptr [[_TMP26]], align 4 +// CK13-NEXT: [[TMP110:%.*]] = load ptr, ptr [[_TMP26]], 
align 4 +// CK13-NEXT: [[TMP111:%.*]] = load ptr, ptr [[TMP110]], align 4 +// CK13-NEXT: [[TMP112:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0 +// CK13-NEXT: store ptr [[TMP111]], ptr [[TMP112]], align 4 +// CK13-NEXT: [[TMP113:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0 +// CK13-NEXT: store ptr [[TMP111]], ptr [[TMP113]], align 4 +// CK13-NEXT: [[TMP114:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS29]], i32 0, i32 0 +// CK13-NEXT: store ptr null, ptr [[TMP114]], align 4 +// CK13-NEXT: [[TMP115:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0 +// CK13-NEXT: [[TMP116:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0 +// CK13-NEXT: [[TMP117:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0 +// CK13-NEXT: store i32 2, ptr [[TMP117]], align 4 +// CK13-NEXT: [[TMP118:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1 +// CK13-NEXT: store i32 1, ptr [[TMP118]], align 4 +// CK13-NEXT: [[TMP119:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2 +// CK13-NEXT: store ptr [[TMP115]], ptr [[TMP119]], align 4 +// CK13-NEXT: [[TMP120:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 3 +// CK13-NEXT: store ptr [[TMP116]], ptr [[TMP120]], align 4 +// CK13-NEXT: [[TMP121:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 4 +// CK13-NEXT: store ptr @.offload_sizes.9, ptr [[TMP121]], align 4 +// CK13-NEXT: [[TMP122:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 5 +// CK13-NEXT: store ptr @.offload_maptypes.10, ptr [[TMP122]], align 4 +// CK13-NEXT: [[TMP123:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 6 +// CK13-NEXT: store ptr null, ptr [[TMP123]], align 4 +// CK13-NEXT: [[TMP124:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 7 +// CK13-NEXT: store ptr null, ptr [[TMP124]], align 4 +// CK13-NEXT: [[TMP125:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 8 +// CK13-NEXT: store i64 0, ptr [[TMP125]], align 8 +// CK13-NEXT: [[TMP126:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 9 +// CK13-NEXT: store i64 0, ptr [[TMP126]], align 8 +// CK13-NEXT: [[TMP127:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 10 +// CK13-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP127]], align 4 +// CK13-NEXT: [[TMP128:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 11 +// CK13-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP128]], align 4 +// CK13-NEXT: [[TMP129:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 12 +// CK13-NEXT: store i32 0, ptr [[TMP129]], align 4 +// CK13-NEXT: [[TMP130:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l67.region_id, ptr [[KERNEL_ARGS30]]) +// CK13-NEXT: [[TMP131:%.*]] = icmp ne i32 [[TMP130]], 0 +// CK13-NEXT: br i1 [[TMP131]], label [[OMP_OFFLOAD_FAILED31:%.*]], label 
[[OMP_OFFLOAD_CONT32:%.*]] +// CK13: omp_offload.failed31: +// CK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l67(ptr [[TMP111]]) #[[ATTR2]] +// CK13-NEXT: br label [[OMP_OFFLOAD_CONT32]] +// CK13: omp_offload.cont32: +// CK13-NEXT: [[TMP132:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 +// CK13-NEXT: store ptr [[TMP132]], ptr [[_TMP33]], align 4 +// CK13-NEXT: [[TMP133:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 +// CK13-NEXT: store ptr [[TMP133]], ptr [[_TMP34]], align 4 +// CK13-NEXT: [[TMP134:%.*]] = load ptr, ptr [[_TMP33]], align 4 +// CK13-NEXT: [[TMP135:%.*]] = load ptr, ptr [[TMP134]], align 4 +// CK13-NEXT: [[TMP136:%.*]] = load ptr, ptr [[_TMP34]], align 4 +// CK13-NEXT: [[TMP137:%.*]] = load ptr, ptr [[TMP136]], align 4 +// CK13-NEXT: [[TMP138:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 0 +// CK13-NEXT: store ptr [[TMP135]], ptr [[TMP138]], align 4 +// CK13-NEXT: [[TMP139:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 0 +// CK13-NEXT: store ptr [[TMP135]], ptr [[TMP139]], align 4 +// CK13-NEXT: [[TMP140:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS37]], i32 0, i32 0 +// CK13-NEXT: store ptr null, ptr [[TMP140]], align 4 +// CK13-NEXT: [[TMP141:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 1 +// CK13-NEXT: store ptr [[TMP137]], ptr [[TMP141]], align 4 +// CK13-NEXT: [[TMP142:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 1 +// CK13-NEXT: store ptr [[TMP137]], ptr [[TMP142]], align 4 +// CK13-NEXT: [[TMP143:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS37]], i32 0, i32 1 +// CK13-NEXT: store ptr null, ptr [[TMP143]], align 4 +// CK13-NEXT: [[TMP144:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 0 +// CK13-NEXT: [[TMP145:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 0 +// CK13-NEXT: [[TMP146:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 0 +// CK13-NEXT: store i32 2, ptr [[TMP146]], align 4 +// CK13-NEXT: [[TMP147:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 1 +// CK13-NEXT: store i32 2, ptr [[TMP147]], align 4 +// CK13-NEXT: [[TMP148:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 2 +// CK13-NEXT: store ptr [[TMP144]], ptr [[TMP148]], align 4 +// CK13-NEXT: [[TMP149:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 3 +// CK13-NEXT: store ptr [[TMP145]], ptr [[TMP149]], align 4 +// CK13-NEXT: [[TMP150:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 4 +// CK13-NEXT: store ptr @.offload_sizes.11, ptr [[TMP150]], align 4 +// CK13-NEXT: [[TMP151:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 5 +// CK13-NEXT: store ptr @.offload_maptypes.12, ptr [[TMP151]], align 4 +// CK13-NEXT: [[TMP152:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 6 +// CK13-NEXT: store ptr null, ptr [[TMP152]], align 4 +// CK13-NEXT: [[TMP153:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 7 +// CK13-NEXT: store ptr null, ptr [[TMP153]], align 4 +// CK13-NEXT: [[TMP154:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 8 +// CK13-NEXT: store i64 0, ptr [[TMP154]], align 8 +// CK13-NEXT: [[TMP155:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 9 +// CK13-NEXT: store i64 0, ptr [[TMP155]], align 8 +// CK13-NEXT: [[TMP156:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 10 +// CK13-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP156]], align 4 +// CK13-NEXT: [[TMP157:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 11 +// CK13-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP157]], align 4 +// CK13-NEXT: [[TMP158:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 12 +// CK13-NEXT: store i32 0, ptr [[TMP158]], align 4 +// CK13-NEXT: [[TMP159:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l74.region_id, ptr [[KERNEL_ARGS38]]) +// CK13-NEXT: [[TMP160:%.*]] = icmp ne i32 [[TMP159]], 0 +// CK13-NEXT: br i1 [[TMP160]], label [[OMP_OFFLOAD_FAILED39:%.*]], label [[OMP_OFFLOAD_CONT40:%.*]] +// CK13: omp_offload.failed39: +// CK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l74(ptr [[TMP135]], ptr [[TMP137]]) #[[ATTR2]] +// CK13-NEXT: br label [[OMP_OFFLOAD_CONT40]] +// CK13: omp_offload.cont40: +// CK13-NEXT: ret void +// +// +// CK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l37 +// CK13-SAME: (ptr noundef [[G:%.*]]) #[[ATTR1:[0-9]+]] { +// CK13-NEXT: entry: +// CK13-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 4 +// CK13-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 4 +// CK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[G_ADDR]], align 4 +// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK13-NEXT: store ptr [[INCDEC_PTR]], ptr [[G_ADDR]], align 4 +// CK13-NEXT: ret void +// +// +// CK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l43 +// CK13-SAME: (ptr noundef [[L:%.*]]) #[[ATTR1]] { +// CK13-NEXT: entry: +// CK13-NEXT: [[L_ADDR:%.*]] = alloca ptr, align 4 +// CK13-NEXT: store ptr [[L]], ptr [[L_ADDR]], align 4 +// CK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[L_ADDR]], align 4 +// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 1 +// CK13-NEXT: store ptr [[INCDEC_PTR]], ptr [[L_ADDR]], align 4 +// CK13-NEXT: ret void +// +// +// CK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l49 +// CK13-SAME: (ptr noundef [[T:%.*]]) #[[ATTR1]] { +// CK13-NEXT: entry: +// CK13-NEXT: [[T_ADDR:%.*]] = alloca ptr, align 4 +// CK13-NEXT: store ptr [[T]], ptr [[T_ADDR]], align 4 +// CK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_ADDR]], align 4 +// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 1 +// CK13-NEXT: store ptr [[INCDEC_PTR]], ptr [[T_ADDR]], align 4 +// CK13-NEXT: ret void +// +// +// CK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l55 +// CK13-SAME: (ptr noundef [[LR:%.*]]) #[[ATTR1]] { +// CK13-NEXT: entry: +// CK13-NEXT: [[LR_ADDR:%.*]] = alloca ptr, align 4 +// CK13-NEXT: [[TMP:%.*]] = alloca ptr, align 4 +// CK13-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 4 +// CK13-NEXT: store ptr [[LR_ADDR]], ptr [[TMP]], align 4 +// CK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 
+// CK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 +// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// CK13-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 +// CK13-NEXT: ret void +// +// +// CK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l61 +// CK13-SAME: (ptr noundef [[TR:%.*]]) #[[ATTR1]] { +// CK13-NEXT: entry: +// CK13-NEXT: [[TR_ADDR:%.*]] = alloca ptr, align 4 +// CK13-NEXT: [[TMP:%.*]] = alloca ptr, align 4 +// CK13-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 4 +// CK13-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 4 +// CK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 +// CK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 +// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK13-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 +// CK13-NEXT: ret void +// +// +// CK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l67 +// CK13-SAME: (ptr noundef [[TR:%.*]]) #[[ATTR1]] { +// CK13-NEXT: entry: +// CK13-NEXT: [[TR_ADDR:%.*]] = alloca ptr, align 4 +// CK13-NEXT: [[TMP:%.*]] = alloca ptr, align 4 +// CK13-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 4 +// CK13-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 4 +// CK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 +// CK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 +// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK13-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 +// CK13-NEXT: ret void +// +// +// CK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIiEvRPfRPT__l74 +// CK13-SAME: (ptr noundef [[TR:%.*]], ptr noundef [[LR:%.*]]) #[[ATTR1]] { +// CK13-NEXT: entry: +// CK13-NEXT: [[TR_ADDR:%.*]] = alloca ptr, align 4 +// CK13-NEXT: [[LR_ADDR:%.*]] = alloca ptr, align 4 +// CK13-NEXT: [[TMP:%.*]] = alloca ptr, align 4 +// CK13-NEXT: [[_TMP1:%.*]] = alloca ptr, align 4 +// CK13-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 4 +// CK13-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 4 +// CK13-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 4 +// CK13-NEXT: store ptr [[LR_ADDR]], ptr [[_TMP1]], align 4 +// CK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 +// CK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 +// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK13-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 +// CK13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP1]], align 4 +// CK13-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 4 +// CK13-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 1 +// CK13-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP2]], align 4 +// CK13-NEXT: ret void +// +// +// CK13-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// CK13-SAME: () #[[ATTR3:[0-9]+]] { +// CK13-NEXT: entry: +// CK13-NEXT: call void @__tgt_register_requires(i64 1) +// CK13-NEXT: ret void +// +// +// SIMD-ONLY00-LABEL: define {{[^@]+}}@_Z3barRPfRPi +// SIMD-ONLY00-SAME: (ptr noundef nonnull align 8 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]]) #[[ATTR0:[0-9]+]] { +// SIMD-ONLY00-NEXT: entry: +// SIMD-ONLY00-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +// SIMD-ONLY00-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// SIMD-ONLY00-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 +// SIMD-ONLY00-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// 
SIMD-ONLY00-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// SIMD-ONLY00-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// SIMD-ONLY00-NEXT: call void @_Z3fooIiEvRPfRPT_(ptr noundef nonnull align 8 dereferenceable(8) [[TMP0]], ptr noundef nonnull align 8 dereferenceable(8) [[TMP1]]) +// SIMD-ONLY00-NEXT: ret void +// +// +// SIMD-ONLY00-LABEL: define {{[^@]+}}@_Z3fooIiEvRPfRPT_ +// SIMD-ONLY00-SAME: (ptr noundef nonnull align 8 dereferenceable(8) [[LR:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[TR:%.*]]) #[[ATTR0]] comdat { +// SIMD-ONLY00-NEXT: entry: +// SIMD-ONLY00-NEXT: [[LR_ADDR:%.*]] = alloca ptr, align 8 +// SIMD-ONLY00-NEXT: [[TR_ADDR:%.*]] = alloca ptr, align 8 +// SIMD-ONLY00-NEXT: [[L:%.*]] = alloca ptr, align 8 +// SIMD-ONLY00-NEXT: [[T:%.*]] = alloca ptr, align 8 +// SIMD-ONLY00-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// SIMD-ONLY00-NEXT: [[_TMP4:%.*]] = alloca ptr, align 8 +// SIMD-ONLY00-NEXT: [[_TMP6:%.*]] = alloca ptr, align 8 +// SIMD-ONLY00-NEXT: [[_TMP8:%.*]] = alloca ptr, align 8 +// SIMD-ONLY00-NEXT: [[_TMP9:%.*]] = alloca ptr, align 8 +// SIMD-ONLY00-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 8 +// SIMD-ONLY00-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 8 +// SIMD-ONLY00-NEXT: [[TMP0:%.*]] = load ptr, ptr @g, align 8 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR]], ptr @g, align 8 +// SIMD-ONLY00-NEXT: [[TMP1:%.*]] = load ptr, ptr [[L]], align 8 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR1]], ptr [[L]], align 8 +// SIMD-ONLY00-NEXT: [[TMP2:%.*]] = load ptr, ptr [[T]], align 8 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +// SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR2]], ptr [[T]], align 8 +// SIMD-ONLY00-NEXT: [[TMP3:%.*]] = load ptr, ptr [[LR_ADDR]], align 8 +// SIMD-ONLY00-NEXT: store ptr [[TMP3]], ptr [[TMP]], align 8 +// SIMD-ONLY00-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LR_ADDR]], align 8 +// SIMD-ONLY00-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8 +// SIMD-ONLY00-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 1 +// SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR3]], ptr [[TMP5]], align 8 +// SIMD-ONLY00-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 +// SIMD-ONLY00-NEXT: store ptr [[TMP7]], ptr [[_TMP4]], align 8 +// SIMD-ONLY00-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 +// SIMD-ONLY00-NEXT: [[TMP9:%.*]] = load ptr, ptr [[_TMP4]], align 8 +// SIMD-ONLY00-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 +// SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR5]], ptr [[TMP9]], align 8 +// SIMD-ONLY00-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 +// SIMD-ONLY00-NEXT: store ptr [[TMP11]], ptr [[_TMP6]], align 8 +// SIMD-ONLY00-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 +// SIMD-ONLY00-NEXT: [[TMP13:%.*]] = load ptr, ptr [[_TMP6]], align 8 +// SIMD-ONLY00-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 8 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 +// SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR7]], ptr [[TMP13]], align 8 +// SIMD-ONLY00-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 +// SIMD-ONLY00-NEXT: store 
ptr [[TMP15]], ptr [[_TMP8]], align 8 +// SIMD-ONLY00-NEXT: [[TMP16:%.*]] = load ptr, ptr [[LR_ADDR]], align 8 +// SIMD-ONLY00-NEXT: store ptr [[TMP16]], ptr [[_TMP9]], align 8 +// SIMD-ONLY00-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 +// SIMD-ONLY00-NEXT: [[TMP18:%.*]] = load ptr, ptr [[LR_ADDR]], align 8 +// SIMD-ONLY00-NEXT: [[TMP19:%.*]] = load ptr, ptr [[_TMP8]], align 8 +// SIMD-ONLY00-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR10:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 1 +// SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR10]], ptr [[TMP19]], align 8 +// SIMD-ONLY00-NEXT: [[TMP21:%.*]] = load ptr, ptr [[_TMP9]], align 8 +// SIMD-ONLY00-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP21]], align 8 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR11:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 1 +// SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR11]], ptr [[TMP21]], align 8 +// SIMD-ONLY00-NEXT: ret void +// +// +// SIMD-ONLY01-LABEL: define {{[^@]+}}@_Z3barRPfRPi +// SIMD-ONLY01-SAME: (ptr noundef nonnull align 8 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]]) #[[ATTR0:[0-9]+]] { +// SIMD-ONLY01-NEXT: entry: +// SIMD-ONLY01-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +// SIMD-ONLY01-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// SIMD-ONLY01-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 +// SIMD-ONLY01-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// SIMD-ONLY01-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// SIMD-ONLY01-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// SIMD-ONLY01-NEXT: call void @_Z3fooIiEvRPfRPT_(ptr noundef nonnull align 8 dereferenceable(8) [[TMP0]], ptr noundef nonnull align 8 dereferenceable(8) [[TMP1]]) +// SIMD-ONLY01-NEXT: ret void +// +// +// SIMD-ONLY01-LABEL: define {{[^@]+}}@_Z3fooIiEvRPfRPT_ +// SIMD-ONLY01-SAME: (ptr noundef nonnull align 8 dereferenceable(8) [[LR:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[TR:%.*]]) #[[ATTR0]] comdat { +// SIMD-ONLY01-NEXT: entry: +// SIMD-ONLY01-NEXT: [[LR_ADDR:%.*]] = alloca ptr, align 8 +// SIMD-ONLY01-NEXT: [[TR_ADDR:%.*]] = alloca ptr, align 8 +// SIMD-ONLY01-NEXT: [[L:%.*]] = alloca ptr, align 8 +// SIMD-ONLY01-NEXT: [[T:%.*]] = alloca ptr, align 8 +// SIMD-ONLY01-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// SIMD-ONLY01-NEXT: [[_TMP4:%.*]] = alloca ptr, align 8 +// SIMD-ONLY01-NEXT: [[_TMP6:%.*]] = alloca ptr, align 8 +// SIMD-ONLY01-NEXT: [[_TMP8:%.*]] = alloca ptr, align 8 +// SIMD-ONLY01-NEXT: [[_TMP9:%.*]] = alloca ptr, align 8 +// SIMD-ONLY01-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 8 +// SIMD-ONLY01-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 8 +// SIMD-ONLY01-NEXT: [[TMP0:%.*]] = load ptr, ptr @g, align 8 +// SIMD-ONLY01-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR]], ptr @g, align 8 +// SIMD-ONLY01-NEXT: [[TMP1:%.*]] = load ptr, ptr [[L]], align 8 +// SIMD-ONLY01-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR1]], ptr [[L]], align 8 +// SIMD-ONLY01-NEXT: [[TMP2:%.*]] = load ptr, ptr [[T]], align 8 +// SIMD-ONLY01-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +// SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR2]], ptr [[T]], align 8 +// SIMD-ONLY01-NEXT: [[TMP3:%.*]] = load ptr, ptr [[LR_ADDR]], align 8 +// SIMD-ONLY01-NEXT: store ptr [[TMP3]], ptr [[TMP]], align 8 +// SIMD-ONLY01-NEXT: [[TMP4:%.*]] = 
load ptr, ptr [[LR_ADDR]], align 8 +// SIMD-ONLY01-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8 +// SIMD-ONLY01-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8 +// SIMD-ONLY01-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 1 +// SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR3]], ptr [[TMP5]], align 8 +// SIMD-ONLY01-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 +// SIMD-ONLY01-NEXT: store ptr [[TMP7]], ptr [[_TMP4]], align 8 +// SIMD-ONLY01-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 +// SIMD-ONLY01-NEXT: [[TMP9:%.*]] = load ptr, ptr [[_TMP4]], align 8 +// SIMD-ONLY01-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8 +// SIMD-ONLY01-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 +// SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR5]], ptr [[TMP9]], align 8 +// SIMD-ONLY01-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 +// SIMD-ONLY01-NEXT: store ptr [[TMP11]], ptr [[_TMP6]], align 8 +// SIMD-ONLY01-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 +// SIMD-ONLY01-NEXT: [[TMP13:%.*]] = load ptr, ptr [[_TMP6]], align 8 +// SIMD-ONLY01-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 8 +// SIMD-ONLY01-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 +// SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR7]], ptr [[TMP13]], align 8 +// SIMD-ONLY01-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 +// SIMD-ONLY01-NEXT: store ptr [[TMP15]], ptr [[_TMP8]], align 8 +// SIMD-ONLY01-NEXT: [[TMP16:%.*]] = load ptr, ptr [[LR_ADDR]], align 8 +// SIMD-ONLY01-NEXT: store ptr [[TMP16]], ptr [[_TMP9]], align 8 +// SIMD-ONLY01-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 +// SIMD-ONLY01-NEXT: [[TMP18:%.*]] = load ptr, ptr [[LR_ADDR]], align 8 +// SIMD-ONLY01-NEXT: [[TMP19:%.*]] = load ptr, ptr [[_TMP8]], align 8 +// SIMD-ONLY01-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8 +// SIMD-ONLY01-NEXT: [[INCDEC_PTR10:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 1 +// SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR10]], ptr [[TMP19]], align 8 +// SIMD-ONLY01-NEXT: [[TMP21:%.*]] = load ptr, ptr [[_TMP9]], align 8 +// SIMD-ONLY01-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP21]], align 8 +// SIMD-ONLY01-NEXT: [[INCDEC_PTR11:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 1 +// SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR11]], ptr [[TMP21]], align 8 +// SIMD-ONLY01-NEXT: ret void +// +// +// SIMD-ONLY02-LABEL: define {{[^@]+}}@_Z3barRPfRPi +// SIMD-ONLY02-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) #[[ATTR0:[0-9]+]] { +// SIMD-ONLY02-NEXT: entry: +// SIMD-ONLY02-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4 +// SIMD-ONLY02-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4 +// SIMD-ONLY02-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4 +// SIMD-ONLY02-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 4 +// SIMD-ONLY02-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4 +// SIMD-ONLY02-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 4 +// SIMD-ONLY02-NEXT: call void @_Z3fooIiEvRPfRPT_(ptr noundef nonnull align 4 dereferenceable(4) [[TMP0]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP1]]) +// SIMD-ONLY02-NEXT: ret void +// +// +// SIMD-ONLY02-LABEL: define {{[^@]+}}@_Z3fooIiEvRPfRPT_ +// SIMD-ONLY02-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[LR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[TR:%.*]]) #[[ATTR0]] comdat { +// SIMD-ONLY02-NEXT: entry: +// SIMD-ONLY02-NEXT: 
[[LR_ADDR:%.*]] = alloca ptr, align 4 +// SIMD-ONLY02-NEXT: [[TR_ADDR:%.*]] = alloca ptr, align 4 +// SIMD-ONLY02-NEXT: [[L:%.*]] = alloca ptr, align 4 +// SIMD-ONLY02-NEXT: [[T:%.*]] = alloca ptr, align 4 +// SIMD-ONLY02-NEXT: [[TMP:%.*]] = alloca ptr, align 4 +// SIMD-ONLY02-NEXT: [[_TMP4:%.*]] = alloca ptr, align 4 +// SIMD-ONLY02-NEXT: [[_TMP6:%.*]] = alloca ptr, align 4 +// SIMD-ONLY02-NEXT: [[_TMP8:%.*]] = alloca ptr, align 4 +// SIMD-ONLY02-NEXT: [[_TMP9:%.*]] = alloca ptr, align 4 +// SIMD-ONLY02-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 4 +// SIMD-ONLY02-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 4 +// SIMD-ONLY02-NEXT: [[TMP0:%.*]] = load ptr, ptr @g, align 4 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR]], ptr @g, align 4 +// SIMD-ONLY02-NEXT: [[TMP1:%.*]] = load ptr, ptr [[L]], align 4 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR1]], ptr [[L]], align 4 +// SIMD-ONLY02-NEXT: [[TMP2:%.*]] = load ptr, ptr [[T]], align 4 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +// SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR2]], ptr [[T]], align 4 +// SIMD-ONLY02-NEXT: [[TMP3:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 +// SIMD-ONLY02-NEXT: store ptr [[TMP3]], ptr [[TMP]], align 4 +// SIMD-ONLY02-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 +// SIMD-ONLY02-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4 +// SIMD-ONLY02-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 4 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 1 +// SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR3]], ptr [[TMP5]], align 4 +// SIMD-ONLY02-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 +// SIMD-ONLY02-NEXT: store ptr [[TMP7]], ptr [[_TMP4]], align 4 +// SIMD-ONLY02-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 +// SIMD-ONLY02-NEXT: [[TMP9:%.*]] = load ptr, ptr [[_TMP4]], align 4 +// SIMD-ONLY02-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 4 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 +// SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR5]], ptr [[TMP9]], align 4 +// SIMD-ONLY02-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 +// SIMD-ONLY02-NEXT: store ptr [[TMP11]], ptr [[_TMP6]], align 4 +// SIMD-ONLY02-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 +// SIMD-ONLY02-NEXT: [[TMP13:%.*]] = load ptr, ptr [[_TMP6]], align 4 +// SIMD-ONLY02-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 4 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 +// SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR7]], ptr [[TMP13]], align 4 +// SIMD-ONLY02-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 +// SIMD-ONLY02-NEXT: store ptr [[TMP15]], ptr [[_TMP8]], align 4 +// SIMD-ONLY02-NEXT: [[TMP16:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 +// SIMD-ONLY02-NEXT: store ptr [[TMP16]], ptr [[_TMP9]], align 4 +// SIMD-ONLY02-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 +// SIMD-ONLY02-NEXT: [[TMP18:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 +// SIMD-ONLY02-NEXT: [[TMP19:%.*]] = load ptr, ptr [[_TMP8]], align 4 +// SIMD-ONLY02-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 4 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR10:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 1 +// SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR10]], ptr 
[[TMP19]], align 4 +// SIMD-ONLY02-NEXT: [[TMP21:%.*]] = load ptr, ptr [[_TMP9]], align 4 +// SIMD-ONLY02-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP21]], align 4 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR11:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 1 +// SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR11]], ptr [[TMP21]], align 4 +// SIMD-ONLY02-NEXT: ret void +// +// +// SIMD-ONLY03-LABEL: define {{[^@]+}}@_Z3barRPfRPi +// SIMD-ONLY03-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) #[[ATTR0:[0-9]+]] { +// SIMD-ONLY03-NEXT: entry: +// SIMD-ONLY03-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4 +// SIMD-ONLY03-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4 +// SIMD-ONLY03-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4 +// SIMD-ONLY03-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 4 +// SIMD-ONLY03-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4 +// SIMD-ONLY03-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 4 +// SIMD-ONLY03-NEXT: call void @_Z3fooIiEvRPfRPT_(ptr noundef nonnull align 4 dereferenceable(4) [[TMP0]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP1]]) +// SIMD-ONLY03-NEXT: ret void +// +// +// SIMD-ONLY03-LABEL: define {{[^@]+}}@_Z3fooIiEvRPfRPT_ +// SIMD-ONLY03-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[LR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[TR:%.*]]) #[[ATTR0]] comdat { +// SIMD-ONLY03-NEXT: entry: +// SIMD-ONLY03-NEXT: [[LR_ADDR:%.*]] = alloca ptr, align 4 +// SIMD-ONLY03-NEXT: [[TR_ADDR:%.*]] = alloca ptr, align 4 +// SIMD-ONLY03-NEXT: [[L:%.*]] = alloca ptr, align 4 +// SIMD-ONLY03-NEXT: [[T:%.*]] = alloca ptr, align 4 +// SIMD-ONLY03-NEXT: [[TMP:%.*]] = alloca ptr, align 4 +// SIMD-ONLY03-NEXT: [[_TMP4:%.*]] = alloca ptr, align 4 +// SIMD-ONLY03-NEXT: [[_TMP6:%.*]] = alloca ptr, align 4 +// SIMD-ONLY03-NEXT: [[_TMP8:%.*]] = alloca ptr, align 4 +// SIMD-ONLY03-NEXT: [[_TMP9:%.*]] = alloca ptr, align 4 +// SIMD-ONLY03-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 4 +// SIMD-ONLY03-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 4 +// SIMD-ONLY03-NEXT: [[TMP0:%.*]] = load ptr, ptr @g, align 4 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR]], ptr @g, align 4 +// SIMD-ONLY03-NEXT: [[TMP1:%.*]] = load ptr, ptr [[L]], align 4 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR1]], ptr [[L]], align 4 +// SIMD-ONLY03-NEXT: [[TMP2:%.*]] = load ptr, ptr [[T]], align 4 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +// SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR2]], ptr [[T]], align 4 +// SIMD-ONLY03-NEXT: [[TMP3:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 +// SIMD-ONLY03-NEXT: store ptr [[TMP3]], ptr [[TMP]], align 4 +// SIMD-ONLY03-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 +// SIMD-ONLY03-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4 +// SIMD-ONLY03-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 4 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 1 +// SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR3]], ptr [[TMP5]], align 4 +// SIMD-ONLY03-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 +// SIMD-ONLY03-NEXT: store ptr [[TMP7]], ptr [[_TMP4]], align 4 +// SIMD-ONLY03-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 +// SIMD-ONLY03-NEXT: [[TMP9:%.*]] = load ptr, ptr [[_TMP4]], align 
4 +// SIMD-ONLY03-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 4 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 +// SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR5]], ptr [[TMP9]], align 4 +// SIMD-ONLY03-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 +// SIMD-ONLY03-NEXT: store ptr [[TMP11]], ptr [[_TMP6]], align 4 +// SIMD-ONLY03-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 +// SIMD-ONLY03-NEXT: [[TMP13:%.*]] = load ptr, ptr [[_TMP6]], align 4 +// SIMD-ONLY03-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 4 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 +// SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR7]], ptr [[TMP13]], align 4 +// SIMD-ONLY03-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 +// SIMD-ONLY03-NEXT: store ptr [[TMP15]], ptr [[_TMP8]], align 4 +// SIMD-ONLY03-NEXT: [[TMP16:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 +// SIMD-ONLY03-NEXT: store ptr [[TMP16]], ptr [[_TMP9]], align 4 +// SIMD-ONLY03-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 +// SIMD-ONLY03-NEXT: [[TMP18:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 +// SIMD-ONLY03-NEXT: [[TMP19:%.*]] = load ptr, ptr [[_TMP8]], align 4 +// SIMD-ONLY03-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 4 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR10:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 1 +// SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR10]], ptr [[TMP19]], align 4 +// SIMD-ONLY03-NEXT: [[TMP21:%.*]] = load ptr, ptr [[_TMP9]], align 4 +// SIMD-ONLY03-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP21]], align 4 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR11:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 1 +// SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR11]], ptr [[TMP21]], align 4 +// SIMD-ONLY03-NEXT: ret void +// +// +// CK20-LABEL: define {{[^@]+}}@_Z3barPd +// CK20-SAME: (ptr noundef [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +// CK20-NEXT: entry: +// CK20-NEXT: [[ARG_ADDR:%.*]] = alloca ptr, align 8 +// CK20-NEXT: [[A:%.*]] = alloca [[STRUCT_ST:%.*]], align 8 +// CK20-NEXT: store ptr [[ARG]], ptr [[ARG_ADDR]], align 8 +// CK20-NEXT: call void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) +// CK20-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) +// CK20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 8 +// CK20-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK20-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 8 +// CK20-NEXT: ret void +// +// +// CK20-LABEL: define {{[^@]+}}@_ZN2STIdEC1ERPd +// CK20-SAME: (ptr noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 { +// CK20-NEXT: entry: +// CK20-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CK20-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// CK20-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CK20-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// CK20-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CK20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// CK20-NEXT: call void @_ZN2STIdEC2ERPd(ptr noundef nonnull align 8 dereferenceable(16) [[THIS1]], ptr noundef nonnull align 8 dereferenceable(8) [[TMP0]]) +// CK20-NEXT: ret void +// +// +// CK20-LABEL: define 
{{[^@]+}}@_ZN2STIdE3fooERPd +// CK20-SAME: (ptr noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG:%.*]]) #[[ATTR0]] comdat align 2 { +// CK20-NEXT: entry: +// CK20-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CK20-NEXT: [[ARG_ADDR:%.*]] = alloca ptr, align 8 +// CK20-NEXT: [[LA:%.*]] = alloca ptr, align 8 +// CK20-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +// CK20-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 +// CK20-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 +// CK20-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK20-NEXT: [[DOTOFFLOAD_BASEPTRS2:%.*]] = alloca [2 x ptr], align 8 +// CK20-NEXT: [[DOTOFFLOAD_PTRS3:%.*]] = alloca [2 x ptr], align 8 +// CK20-NEXT: [[DOTOFFLOAD_MAPPERS4:%.*]] = alloca [2 x ptr], align 8 +// CK20-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [2 x i64], align 8 +// CK20-NEXT: [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK20-NEXT: [[DOTOFFLOAD_BASEPTRS10:%.*]] = alloca [3 x ptr], align 8 +// CK20-NEXT: [[DOTOFFLOAD_PTRS11:%.*]] = alloca [3 x ptr], align 8 +// CK20-NEXT: [[DOTOFFLOAD_MAPPERS12:%.*]] = alloca [3 x ptr], align 8 +// CK20-NEXT: [[DOTOFFLOAD_SIZES13:%.*]] = alloca [3 x i64], align 8 +// CK20-NEXT: [[KERNEL_ARGS14:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK20-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CK20-NEXT: store ptr [[ARG]], ptr [[ARG_ADDR]], align 8 +// CK20-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CK20-NEXT: store ptr null, ptr [[LA]], align 8 +// CK20-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 +// CK20-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK20-NEXT: store ptr [[THIS1]], ptr [[TMP0]], align 8 +// CK20-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK20-NEXT: store ptr [[A]], ptr [[TMP1]], align 8 +// CK20-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CK20-NEXT: store ptr null, ptr [[TMP2]], align 8 +// CK20-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK20-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK20-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK20-NEXT: store i32 2, ptr [[TMP5]], align 4 +// CK20-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK20-NEXT: store i32 1, ptr [[TMP6]], align 4 +// CK20-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK20-NEXT: store ptr [[TMP3]], ptr [[TMP7]], align 8 +// CK20-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK20-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 8 +// CK20-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK20-NEXT: store ptr @.offload_sizes, ptr [[TMP9]], align 8 +// CK20-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK20-NEXT: store ptr @.offload_maptypes, ptr [[TMP10]], align 8 +// CK20-NEXT: [[TMP11:%.*]] = 
getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK20-NEXT: store ptr null, ptr [[TMP11]], align 8 +// CK20-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK20-NEXT: store ptr null, ptr [[TMP12]], align 8 +// CK20-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK20-NEXT: store i64 0, ptr [[TMP13]], align 8 +// CK20-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK20-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK20-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK20-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP15]], align 4 +// CK20-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK20-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4 +// CK20-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK20-NEXT: store i32 0, ptr [[TMP17]], align 4 +// CK20-NEXT: [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l112.region_id, ptr [[KERNEL_ARGS]]) +// CK20-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 +// CK20-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CK20: omp_offload.failed: +// CK20-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l112(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CK20-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CK20: omp_offload.cont: +// CK20-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 +// CK20-NEXT: [[TMP20:%.*]] = load ptr, ptr [[B]], align 8 +// CK20-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[B]], i32 1 +// CK20-NEXT: [[TMP22:%.*]] = ptrtoint ptr [[TMP21]] to i64 +// CK20-NEXT: [[TMP23:%.*]] = ptrtoint ptr [[B]] to i64 +// CK20-NEXT: [[TMP24:%.*]] = sub i64 [[TMP22]], [[TMP23]] +// CK20-NEXT: [[TMP25:%.*]] = sdiv exact i64 [[TMP24]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +// CK20-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DOTOFFLOAD_SIZES]], ptr align 8 @.offload_sizes.1, i64 16, i1 false) +// CK20-NEXT: [[TMP26:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0 +// CK20-NEXT: store ptr [[THIS1]], ptr [[TMP26]], align 8 +// CK20-NEXT: [[TMP27:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0 +// CK20-NEXT: store ptr [[B]], ptr [[TMP27]], align 8 +// CK20-NEXT: [[TMP28:%.*]] = getelementptr inbounds [2 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +// CK20-NEXT: store i64 [[TMP25]], ptr [[TMP28]], align 8 +// CK20-NEXT: [[TMP29:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS4]], i64 0, i64 0 +// CK20-NEXT: store ptr null, ptr [[TMP29]], align 8 +// CK20-NEXT: [[TMP30:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 1 +// CK20-NEXT: store ptr [[THIS1]], ptr [[TMP30]], align 8 +// CK20-NEXT: [[TMP31:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 1 +// CK20-NEXT: store ptr [[TMP20]], ptr [[TMP31]], align 8 +// CK20-NEXT: [[TMP32:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS4]], i64 
0, i64 1 +// CK20-NEXT: store ptr null, ptr [[TMP32]], align 8 +// CK20-NEXT: [[TMP33:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0 +// CK20-NEXT: [[TMP34:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0 +// CK20-NEXT: [[TMP35:%.*]] = getelementptr inbounds [2 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +// CK20-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0 +// CK20-NEXT: store i32 2, ptr [[TMP36]], align 4 +// CK20-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1 +// CK20-NEXT: store i32 2, ptr [[TMP37]], align 4 +// CK20-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2 +// CK20-NEXT: store ptr [[TMP33]], ptr [[TMP38]], align 8 +// CK20-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 3 +// CK20-NEXT: store ptr [[TMP34]], ptr [[TMP39]], align 8 +// CK20-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 4 +// CK20-NEXT: store ptr [[TMP35]], ptr [[TMP40]], align 8 +// CK20-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 5 +// CK20-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP41]], align 8 +// CK20-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 6 +// CK20-NEXT: store ptr null, ptr [[TMP42]], align 8 +// CK20-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 7 +// CK20-NEXT: store ptr null, ptr [[TMP43]], align 8 +// CK20-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 8 +// CK20-NEXT: store i64 0, ptr [[TMP44]], align 8 +// CK20-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 9 +// CK20-NEXT: store i64 0, ptr [[TMP45]], align 8 +// CK20-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 10 +// CK20-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP46]], align 4 +// CK20-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 11 +// CK20-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP47]], align 4 +// CK20-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 12 +// CK20-NEXT: store i32 0, ptr [[TMP48]], align 4 +// CK20-NEXT: [[TMP49:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l118.region_id, ptr [[KERNEL_ARGS5]]) +// CK20-NEXT: [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0 +// CK20-NEXT: br i1 [[TMP50]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] +// CK20: omp_offload.failed6: +// CK20-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l118(ptr [[THIS1]]) #[[ATTR3]] +// CK20-NEXT: br label [[OMP_OFFLOAD_CONT7]] +// CK20: omp_offload.cont7: +// CK20-NEXT: [[A8:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 0 +// CK20-NEXT: [[B9:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 +// CK20-NEXT: [[TMP51:%.*]] = 
load ptr, ptr [[B9]], align 8 +// CK20-NEXT: [[TMP52:%.*]] = getelementptr ptr, ptr [[B9]], i32 1 +// CK20-NEXT: [[TMP53:%.*]] = ptrtoint ptr [[TMP52]] to i64 +// CK20-NEXT: [[TMP54:%.*]] = ptrtoint ptr [[A8]] to i64 +// CK20-NEXT: [[TMP55:%.*]] = sub i64 [[TMP53]], [[TMP54]] +// CK20-NEXT: [[TMP56:%.*]] = sdiv exact i64 [[TMP55]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +// CK20-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DOTOFFLOAD_SIZES13]], ptr align 8 @.offload_sizes.3, i64 24, i1 false) +// CK20-NEXT: [[TMP57:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0 +// CK20-NEXT: store ptr [[THIS1]], ptr [[TMP57]], align 8 +// CK20-NEXT: [[TMP58:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0 +// CK20-NEXT: store ptr [[A8]], ptr [[TMP58]], align 8 +// CK20-NEXT: [[TMP59:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES13]], i32 0, i32 0 +// CK20-NEXT: store i64 [[TMP56]], ptr [[TMP59]], align 8 +// CK20-NEXT: [[TMP60:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS12]], i64 0, i64 0 +// CK20-NEXT: store ptr null, ptr [[TMP60]], align 8 +// CK20-NEXT: [[TMP61:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 1 +// CK20-NEXT: store ptr [[THIS1]], ptr [[TMP61]], align 8 +// CK20-NEXT: [[TMP62:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 1 +// CK20-NEXT: store ptr [[A8]], ptr [[TMP62]], align 8 +// CK20-NEXT: [[TMP63:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS12]], i64 0, i64 1 +// CK20-NEXT: store ptr null, ptr [[TMP63]], align 8 +// CK20-NEXT: [[TMP64:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 2 +// CK20-NEXT: store ptr [[THIS1]], ptr [[TMP64]], align 8 +// CK20-NEXT: [[TMP65:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 2 +// CK20-NEXT: store ptr [[TMP51]], ptr [[TMP65]], align 8 +// CK20-NEXT: [[TMP66:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS12]], i64 0, i64 2 +// CK20-NEXT: store ptr null, ptr [[TMP66]], align 8 +// CK20-NEXT: [[TMP67:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0 +// CK20-NEXT: [[TMP68:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0 +// CK20-NEXT: [[TMP69:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES13]], i32 0, i32 0 +// CK20-NEXT: [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0 +// CK20-NEXT: store i32 2, ptr [[TMP70]], align 4 +// CK20-NEXT: [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1 +// CK20-NEXT: store i32 3, ptr [[TMP71]], align 4 +// CK20-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2 +// CK20-NEXT: store ptr [[TMP67]], ptr [[TMP72]], align 8 +// CK20-NEXT: [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 3 +// CK20-NEXT: store ptr [[TMP68]], ptr [[TMP73]], align 8 +// CK20-NEXT: [[TMP74:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 4 +// CK20-NEXT: store ptr [[TMP69]], ptr [[TMP74]], align 8 +// CK20-NEXT: [[TMP75:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 5 +// CK20-NEXT: store ptr 
@.offload_maptypes.4, ptr [[TMP75]], align 8 +// CK20-NEXT: [[TMP76:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 6 +// CK20-NEXT: store ptr null, ptr [[TMP76]], align 8 +// CK20-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 7 +// CK20-NEXT: store ptr null, ptr [[TMP77]], align 8 +// CK20-NEXT: [[TMP78:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 8 +// CK20-NEXT: store i64 0, ptr [[TMP78]], align 8 +// CK20-NEXT: [[TMP79:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 9 +// CK20-NEXT: store i64 0, ptr [[TMP79]], align 8 +// CK20-NEXT: [[TMP80:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 10 +// CK20-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP80]], align 4 +// CK20-NEXT: [[TMP81:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 11 +// CK20-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP81]], align 4 +// CK20-NEXT: [[TMP82:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 12 +// CK20-NEXT: store i32 0, ptr [[TMP82]], align 4 +// CK20-NEXT: [[TMP83:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l125.region_id, ptr [[KERNEL_ARGS14]]) +// CK20-NEXT: [[TMP84:%.*]] = icmp ne i32 [[TMP83]], 0 +// CK20-NEXT: br i1 [[TMP84]], label [[OMP_OFFLOAD_FAILED15:%.*]], label [[OMP_OFFLOAD_CONT16:%.*]] +// CK20: omp_offload.failed15: +// CK20-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l125(ptr [[THIS1]]) #[[ATTR3]] +// CK20-NEXT: br label [[OMP_OFFLOAD_CONT16]] +// CK20: omp_offload.cont16: +// CK20-NEXT: ret void +// +// +// CK20-LABEL: define {{[^@]+}}@_ZN2STIdEC2ERPd +// CK20-SAME: (ptr noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 { +// CK20-NEXT: entry: +// CK20-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CK20-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// CK20-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CK20-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// CK20-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CK20-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 +// CK20-NEXT: store ptr null, ptr [[A]], align 8 +// CK20-NEXT: [[B2:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 +// CK20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// CK20-NEXT: store ptr [[TMP0]], ptr [[B2]], align 8 +// CK20-NEXT: ret void +// +// +// CK20-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l112 +// CK20-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CK20-NEXT: entry: +// CK20-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CK20-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CK20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CK20-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CK20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 8 +// CK20-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK20-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 8 +// 
CK20-NEXT: ret void +// +// +// CK20-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l118 +// CK20-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CK20-NEXT: entry: +// CK20-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CK20-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CK20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CK20-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 1 +// CK20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 8 +// CK20-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 +// CK20-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// CK20-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP1]], align 8 +// CK20-NEXT: ret void +// +// +// CK20-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l125 +// CK20-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CK20-NEXT: entry: +// CK20-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CK20-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CK20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CK20-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CK20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 8 +// CK20-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK20-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 8 +// CK20-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[TMP0]], i32 0, i32 1 +// CK20-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B]], align 8 +// CK20-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8 +// CK20-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// CK20-NEXT: store ptr [[INCDEC_PTR1]], ptr [[TMP2]], align 8 +// CK20-NEXT: ret void +// +// +// CK20-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// CK20-SAME: () #[[ATTR5:[0-9]+]] { +// CK20-NEXT: entry: +// CK20-NEXT: call void @__tgt_register_requires(i64 1) +// CK20-NEXT: ret void +// +// +// CK21-LABEL: define {{[^@]+}}@_Z3barPd +// CK21-SAME: (ptr noundef [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +// CK21-NEXT: entry: +// CK21-NEXT: [[ARG_ADDR:%.*]] = alloca ptr, align 8 +// CK21-NEXT: [[A:%.*]] = alloca [[STRUCT_ST:%.*]], align 8 +// CK21-NEXT: store ptr [[ARG]], ptr [[ARG_ADDR]], align 8 +// CK21-NEXT: call void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) +// CK21-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) +// CK21-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 8 +// CK21-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK21-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 8 +// CK21-NEXT: ret void +// +// +// CK21-LABEL: define {{[^@]+}}@_ZN2STIdEC1ERPd +// CK21-SAME: (ptr noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 { +// CK21-NEXT: entry: +// CK21-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CK21-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// CK21-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CK21-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// CK21-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CK21-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// CK21-NEXT: call 
void @_ZN2STIdEC2ERPd(ptr noundef nonnull align 8 dereferenceable(16) [[THIS1]], ptr noundef nonnull align 8 dereferenceable(8) [[TMP0]]) +// CK21-NEXT: ret void +// +// +// CK21-LABEL: define {{[^@]+}}@_ZN2STIdE3fooERPd +// CK21-SAME: (ptr noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG:%.*]]) #[[ATTR0]] comdat align 2 { +// CK21-NEXT: entry: +// CK21-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CK21-NEXT: [[ARG_ADDR:%.*]] = alloca ptr, align 8 +// CK21-NEXT: [[LA:%.*]] = alloca ptr, align 8 +// CK21-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +// CK21-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 +// CK21-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 +// CK21-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK21-NEXT: [[DOTOFFLOAD_BASEPTRS2:%.*]] = alloca [2 x ptr], align 8 +// CK21-NEXT: [[DOTOFFLOAD_PTRS3:%.*]] = alloca [2 x ptr], align 8 +// CK21-NEXT: [[DOTOFFLOAD_MAPPERS4:%.*]] = alloca [2 x ptr], align 8 +// CK21-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [2 x i64], align 8 +// CK21-NEXT: [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK21-NEXT: [[DOTOFFLOAD_BASEPTRS10:%.*]] = alloca [3 x ptr], align 8 +// CK21-NEXT: [[DOTOFFLOAD_PTRS11:%.*]] = alloca [3 x ptr], align 8 +// CK21-NEXT: [[DOTOFFLOAD_MAPPERS12:%.*]] = alloca [3 x ptr], align 8 +// CK21-NEXT: [[DOTOFFLOAD_SIZES13:%.*]] = alloca [3 x i64], align 8 +// CK21-NEXT: [[KERNEL_ARGS14:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK21-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CK21-NEXT: store ptr [[ARG]], ptr [[ARG_ADDR]], align 8 +// CK21-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CK21-NEXT: store ptr null, ptr [[LA]], align 8 +// CK21-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 +// CK21-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK21-NEXT: store ptr [[THIS1]], ptr [[TMP0]], align 8 +// CK21-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK21-NEXT: store ptr [[A]], ptr [[TMP1]], align 8 +// CK21-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CK21-NEXT: store ptr null, ptr [[TMP2]], align 8 +// CK21-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK21-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK21-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK21-NEXT: store i32 2, ptr [[TMP5]], align 4 +// CK21-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK21-NEXT: store i32 1, ptr [[TMP6]], align 4 +// CK21-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK21-NEXT: store ptr [[TMP3]], ptr [[TMP7]], align 8 +// CK21-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK21-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 8 +// CK21-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK21-NEXT: store ptr @.offload_sizes, ptr [[TMP9]], align 8 +// CK21-NEXT: [[TMP10:%.*]] = 
getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK21-NEXT: store ptr @.offload_maptypes, ptr [[TMP10]], align 8 +// CK21-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK21-NEXT: store ptr null, ptr [[TMP11]], align 8 +// CK21-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK21-NEXT: store ptr null, ptr [[TMP12]], align 8 +// CK21-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK21-NEXT: store i64 0, ptr [[TMP13]], align 8 +// CK21-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK21-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK21-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK21-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP15]], align 4 +// CK21-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK21-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4 +// CK21-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK21-NEXT: store i32 0, ptr [[TMP17]], align 4 +// CK21-NEXT: [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l112.region_id, ptr [[KERNEL_ARGS]]) +// CK21-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 +// CK21-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CK21: omp_offload.failed: +// CK21-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l112(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CK21-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CK21: omp_offload.cont: +// CK21-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 +// CK21-NEXT: [[TMP20:%.*]] = load ptr, ptr [[B]], align 8 +// CK21-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[B]], i32 1 +// CK21-NEXT: [[TMP22:%.*]] = ptrtoint ptr [[TMP21]] to i64 +// CK21-NEXT: [[TMP23:%.*]] = ptrtoint ptr [[B]] to i64 +// CK21-NEXT: [[TMP24:%.*]] = sub i64 [[TMP22]], [[TMP23]] +// CK21-NEXT: [[TMP25:%.*]] = sdiv exact i64 [[TMP24]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +// CK21-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DOTOFFLOAD_SIZES]], ptr align 8 @.offload_sizes.1, i64 16, i1 false) +// CK21-NEXT: [[TMP26:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0 +// CK21-NEXT: store ptr [[THIS1]], ptr [[TMP26]], align 8 +// CK21-NEXT: [[TMP27:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0 +// CK21-NEXT: store ptr [[B]], ptr [[TMP27]], align 8 +// CK21-NEXT: [[TMP28:%.*]] = getelementptr inbounds [2 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +// CK21-NEXT: store i64 [[TMP25]], ptr [[TMP28]], align 8 +// CK21-NEXT: [[TMP29:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS4]], i64 0, i64 0 +// CK21-NEXT: store ptr null, ptr [[TMP29]], align 8 +// CK21-NEXT: [[TMP30:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 1 +// CK21-NEXT: store ptr [[THIS1]], ptr [[TMP30]], align 8 +// CK21-NEXT: [[TMP31:%.*]] = getelementptr inbounds [2 x ptr], ptr 
[[DOTOFFLOAD_PTRS3]], i32 0, i32 1 +// CK21-NEXT: store ptr [[TMP20]], ptr [[TMP31]], align 8 +// CK21-NEXT: [[TMP32:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS4]], i64 0, i64 1 +// CK21-NEXT: store ptr null, ptr [[TMP32]], align 8 +// CK21-NEXT: [[TMP33:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0 +// CK21-NEXT: [[TMP34:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0 +// CK21-NEXT: [[TMP35:%.*]] = getelementptr inbounds [2 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +// CK21-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0 +// CK21-NEXT: store i32 2, ptr [[TMP36]], align 4 +// CK21-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1 +// CK21-NEXT: store i32 2, ptr [[TMP37]], align 4 +// CK21-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2 +// CK21-NEXT: store ptr [[TMP33]], ptr [[TMP38]], align 8 +// CK21-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 3 +// CK21-NEXT: store ptr [[TMP34]], ptr [[TMP39]], align 8 +// CK21-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 4 +// CK21-NEXT: store ptr [[TMP35]], ptr [[TMP40]], align 8 +// CK21-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 5 +// CK21-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP41]], align 8 +// CK21-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 6 +// CK21-NEXT: store ptr null, ptr [[TMP42]], align 8 +// CK21-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 7 +// CK21-NEXT: store ptr null, ptr [[TMP43]], align 8 +// CK21-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 8 +// CK21-NEXT: store i64 0, ptr [[TMP44]], align 8 +// CK21-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 9 +// CK21-NEXT: store i64 0, ptr [[TMP45]], align 8 +// CK21-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 10 +// CK21-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP46]], align 4 +// CK21-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 11 +// CK21-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP47]], align 4 +// CK21-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 12 +// CK21-NEXT: store i32 0, ptr [[TMP48]], align 4 +// CK21-NEXT: [[TMP49:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l118.region_id, ptr [[KERNEL_ARGS5]]) +// CK21-NEXT: [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0 +// CK21-NEXT: br i1 [[TMP50]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] +// CK21: omp_offload.failed6: +// CK21-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l118(ptr [[THIS1]]) #[[ATTR3]] +// CK21-NEXT: br label [[OMP_OFFLOAD_CONT7]] +// CK21: omp_offload.cont7: +// CK21-NEXT: [[A8:%.*]] = 
getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 0 +// CK21-NEXT: [[B9:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 +// CK21-NEXT: [[TMP51:%.*]] = load ptr, ptr [[B9]], align 8 +// CK21-NEXT: [[TMP52:%.*]] = getelementptr ptr, ptr [[B9]], i32 1 +// CK21-NEXT: [[TMP53:%.*]] = ptrtoint ptr [[TMP52]] to i64 +// CK21-NEXT: [[TMP54:%.*]] = ptrtoint ptr [[A8]] to i64 +// CK21-NEXT: [[TMP55:%.*]] = sub i64 [[TMP53]], [[TMP54]] +// CK21-NEXT: [[TMP56:%.*]] = sdiv exact i64 [[TMP55]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +// CK21-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DOTOFFLOAD_SIZES13]], ptr align 8 @.offload_sizes.3, i64 24, i1 false) +// CK21-NEXT: [[TMP57:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0 +// CK21-NEXT: store ptr [[THIS1]], ptr [[TMP57]], align 8 +// CK21-NEXT: [[TMP58:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0 +// CK21-NEXT: store ptr [[A8]], ptr [[TMP58]], align 8 +// CK21-NEXT: [[TMP59:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES13]], i32 0, i32 0 +// CK21-NEXT: store i64 [[TMP56]], ptr [[TMP59]], align 8 +// CK21-NEXT: [[TMP60:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS12]], i64 0, i64 0 +// CK21-NEXT: store ptr null, ptr [[TMP60]], align 8 +// CK21-NEXT: [[TMP61:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 1 +// CK21-NEXT: store ptr [[THIS1]], ptr [[TMP61]], align 8 +// CK21-NEXT: [[TMP62:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 1 +// CK21-NEXT: store ptr [[A8]], ptr [[TMP62]], align 8 +// CK21-NEXT: [[TMP63:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS12]], i64 0, i64 1 +// CK21-NEXT: store ptr null, ptr [[TMP63]], align 8 +// CK21-NEXT: [[TMP64:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 2 +// CK21-NEXT: store ptr [[THIS1]], ptr [[TMP64]], align 8 +// CK21-NEXT: [[TMP65:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 2 +// CK21-NEXT: store ptr [[TMP51]], ptr [[TMP65]], align 8 +// CK21-NEXT: [[TMP66:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS12]], i64 0, i64 2 +// CK21-NEXT: store ptr null, ptr [[TMP66]], align 8 +// CK21-NEXT: [[TMP67:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0 +// CK21-NEXT: [[TMP68:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0 +// CK21-NEXT: [[TMP69:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES13]], i32 0, i32 0 +// CK21-NEXT: [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0 +// CK21-NEXT: store i32 2, ptr [[TMP70]], align 4 +// CK21-NEXT: [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1 +// CK21-NEXT: store i32 3, ptr [[TMP71]], align 4 +// CK21-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2 +// CK21-NEXT: store ptr [[TMP67]], ptr [[TMP72]], align 8 +// CK21-NEXT: [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 3 +// CK21-NEXT: store ptr [[TMP68]], ptr [[TMP73]], align 8 +// CK21-NEXT: [[TMP74:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 4 +// CK21-NEXT: store ptr 
[[TMP69]], ptr [[TMP74]], align 8 +// CK21-NEXT: [[TMP75:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 5 +// CK21-NEXT: store ptr @.offload_maptypes.4, ptr [[TMP75]], align 8 +// CK21-NEXT: [[TMP76:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 6 +// CK21-NEXT: store ptr null, ptr [[TMP76]], align 8 +// CK21-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 7 +// CK21-NEXT: store ptr null, ptr [[TMP77]], align 8 +// CK21-NEXT: [[TMP78:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 8 +// CK21-NEXT: store i64 0, ptr [[TMP78]], align 8 +// CK21-NEXT: [[TMP79:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 9 +// CK21-NEXT: store i64 0, ptr [[TMP79]], align 8 +// CK21-NEXT: [[TMP80:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 10 +// CK21-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP80]], align 4 +// CK21-NEXT: [[TMP81:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 11 +// CK21-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP81]], align 4 +// CK21-NEXT: [[TMP82:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 12 +// CK21-NEXT: store i32 0, ptr [[TMP82]], align 4 +// CK21-NEXT: [[TMP83:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l125.region_id, ptr [[KERNEL_ARGS14]]) +// CK21-NEXT: [[TMP84:%.*]] = icmp ne i32 [[TMP83]], 0 +// CK21-NEXT: br i1 [[TMP84]], label [[OMP_OFFLOAD_FAILED15:%.*]], label [[OMP_OFFLOAD_CONT16:%.*]] +// CK21: omp_offload.failed15: +// CK21-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l125(ptr [[THIS1]]) #[[ATTR3]] +// CK21-NEXT: br label [[OMP_OFFLOAD_CONT16]] +// CK21: omp_offload.cont16: +// CK21-NEXT: ret void +// +// +// CK21-LABEL: define {{[^@]+}}@_ZN2STIdEC2ERPd +// CK21-SAME: (ptr noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 { +// CK21-NEXT: entry: +// CK21-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CK21-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// CK21-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CK21-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// CK21-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CK21-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 +// CK21-NEXT: store ptr null, ptr [[A]], align 8 +// CK21-NEXT: [[B2:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 +// CK21-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// CK21-NEXT: store ptr [[TMP0]], ptr [[B2]], align 8 +// CK21-NEXT: ret void +// +// +// CK21-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l112 +// CK21-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CK21-NEXT: entry: +// CK21-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CK21-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CK21-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CK21-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CK21-NEXT: 
[[TMP1:%.*]] = load ptr, ptr [[A]], align 8 +// CK21-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK21-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 8 +// CK21-NEXT: ret void +// +// +// CK21-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l118 +// CK21-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CK21-NEXT: entry: +// CK21-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CK21-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CK21-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CK21-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 1 +// CK21-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 8 +// CK21-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 +// CK21-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// CK21-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP1]], align 8 +// CK21-NEXT: ret void +// +// +// CK21-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l125 +// CK21-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CK21-NEXT: entry: +// CK21-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CK21-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CK21-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CK21-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CK21-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 8 +// CK21-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK21-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 8 +// CK21-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[TMP0]], i32 0, i32 1 +// CK21-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B]], align 8 +// CK21-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8 +// CK21-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// CK21-NEXT: store ptr [[INCDEC_PTR1]], ptr [[TMP2]], align 8 +// CK21-NEXT: ret void +// +// +// CK21-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// CK21-SAME: () #[[ATTR5:[0-9]+]] { +// CK21-NEXT: entry: +// CK21-NEXT: call void @__tgt_register_requires(i64 1) +// CK21-NEXT: ret void +// +// +// CK22-LABEL: define {{[^@]+}}@_Z3barPd +// CK22-SAME: (ptr noundef [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +// CK22-NEXT: entry: +// CK22-NEXT: [[ARG_ADDR:%.*]] = alloca ptr, align 4 +// CK22-NEXT: [[A:%.*]] = alloca [[STRUCT_ST:%.*]], align 4 +// CK22-NEXT: store ptr [[ARG]], ptr [[ARG_ADDR]], align 4 +// CK22-NEXT: call void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) +// CK22-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) +// CK22-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 4 +// CK22-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK22-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 4 +// CK22-NEXT: ret void +// +// +// CK22-LABEL: define {{[^@]+}}@_ZN2STIdEC1ERPd +// CK22-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 { +// CK22-NEXT: entry: +// CK22-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// CK22-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4 +// CK22-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// CK22-NEXT: 
store ptr [[B]], ptr [[B_ADDR]], align 4 +// CK22-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// CK22-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4 +// CK22-NEXT: call void @_ZN2STIdEC2ERPd(ptr noundef nonnull align 4 dereferenceable(8) [[THIS1]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP0]]) +// CK22-NEXT: ret void +// +// +// CK22-LABEL: define {{[^@]+}}@_ZN2STIdE3fooERPd +// CK22-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG:%.*]]) #[[ATTR0]] comdat align 2 { +// CK22-NEXT: entry: +// CK22-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// CK22-NEXT: [[ARG_ADDR:%.*]] = alloca ptr, align 4 +// CK22-NEXT: [[LA:%.*]] = alloca ptr, align 4 +// CK22-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4 +// CK22-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4 +// CK22-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4 +// CK22-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK22-NEXT: [[DOTOFFLOAD_BASEPTRS2:%.*]] = alloca [2 x ptr], align 4 +// CK22-NEXT: [[DOTOFFLOAD_PTRS3:%.*]] = alloca [2 x ptr], align 4 +// CK22-NEXT: [[DOTOFFLOAD_MAPPERS4:%.*]] = alloca [2 x ptr], align 4 +// CK22-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [2 x i64], align 4 +// CK22-NEXT: [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK22-NEXT: [[DOTOFFLOAD_BASEPTRS10:%.*]] = alloca [3 x ptr], align 4 +// CK22-NEXT: [[DOTOFFLOAD_PTRS11:%.*]] = alloca [3 x ptr], align 4 +// CK22-NEXT: [[DOTOFFLOAD_MAPPERS12:%.*]] = alloca [3 x ptr], align 4 +// CK22-NEXT: [[DOTOFFLOAD_SIZES13:%.*]] = alloca [3 x i64], align 4 +// CK22-NEXT: [[KERNEL_ARGS14:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK22-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// CK22-NEXT: store ptr [[ARG]], ptr [[ARG_ADDR]], align 4 +// CK22-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// CK22-NEXT: store ptr null, ptr [[LA]], align 4 +// CK22-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 +// CK22-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK22-NEXT: store ptr [[THIS1]], ptr [[TMP0]], align 4 +// CK22-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK22-NEXT: store ptr [[A]], ptr [[TMP1]], align 4 +// CK22-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CK22-NEXT: store ptr null, ptr [[TMP2]], align 4 +// CK22-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK22-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK22-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK22-NEXT: store i32 2, ptr [[TMP5]], align 4 +// CK22-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK22-NEXT: store i32 1, ptr [[TMP6]], align 4 +// CK22-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK22-NEXT: store ptr [[TMP3]], ptr [[TMP7]], align 4 +// CK22-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK22-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 4 +// CK22-NEXT: [[TMP9:%.*]] = 
getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK22-NEXT: store ptr @.offload_sizes, ptr [[TMP9]], align 4 +// CK22-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK22-NEXT: store ptr @.offload_maptypes, ptr [[TMP10]], align 4 +// CK22-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK22-NEXT: store ptr null, ptr [[TMP11]], align 4 +// CK22-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK22-NEXT: store ptr null, ptr [[TMP12]], align 4 +// CK22-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK22-NEXT: store i64 0, ptr [[TMP13]], align 8 +// CK22-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK22-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK22-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK22-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP15]], align 4 +// CK22-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK22-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4 +// CK22-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK22-NEXT: store i32 0, ptr [[TMP17]], align 4 +// CK22-NEXT: [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l112.region_id, ptr [[KERNEL_ARGS]]) +// CK22-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 +// CK22-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CK22: omp_offload.failed: +// CK22-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l112(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CK22-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CK22: omp_offload.cont: +// CK22-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 +// CK22-NEXT: [[TMP20:%.*]] = load ptr, ptr [[B]], align 4 +// CK22-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[B]], i32 1 +// CK22-NEXT: [[TMP22:%.*]] = ptrtoint ptr [[TMP21]] to i64 +// CK22-NEXT: [[TMP23:%.*]] = ptrtoint ptr [[B]] to i64 +// CK22-NEXT: [[TMP24:%.*]] = sub i64 [[TMP22]], [[TMP23]] +// CK22-NEXT: [[TMP25:%.*]] = sdiv exact i64 [[TMP24]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +// CK22-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[DOTOFFLOAD_SIZES]], ptr align 4 @.offload_sizes.1, i32 16, i1 false) +// CK22-NEXT: [[TMP26:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0 +// CK22-NEXT: store ptr [[THIS1]], ptr [[TMP26]], align 4 +// CK22-NEXT: [[TMP27:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0 +// CK22-NEXT: store ptr [[B]], ptr [[TMP27]], align 4 +// CK22-NEXT: [[TMP28:%.*]] = getelementptr inbounds [2 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +// CK22-NEXT: store i64 [[TMP25]], ptr [[TMP28]], align 4 +// CK22-NEXT: [[TMP29:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 0 +// CK22-NEXT: store ptr null, ptr [[TMP29]], align 4 +// CK22-NEXT: [[TMP30:%.*]] = getelementptr 
inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 1 +// CK22-NEXT: store ptr [[THIS1]], ptr [[TMP30]], align 4 +// CK22-NEXT: [[TMP31:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 1 +// CK22-NEXT: store ptr [[TMP20]], ptr [[TMP31]], align 4 +// CK22-NEXT: [[TMP32:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 1 +// CK22-NEXT: store ptr null, ptr [[TMP32]], align 4 +// CK22-NEXT: [[TMP33:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0 +// CK22-NEXT: [[TMP34:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0 +// CK22-NEXT: [[TMP35:%.*]] = getelementptr inbounds [2 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +// CK22-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0 +// CK22-NEXT: store i32 2, ptr [[TMP36]], align 4 +// CK22-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1 +// CK22-NEXT: store i32 2, ptr [[TMP37]], align 4 +// CK22-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2 +// CK22-NEXT: store ptr [[TMP33]], ptr [[TMP38]], align 4 +// CK22-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 3 +// CK22-NEXT: store ptr [[TMP34]], ptr [[TMP39]], align 4 +// CK22-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 4 +// CK22-NEXT: store ptr [[TMP35]], ptr [[TMP40]], align 4 +// CK22-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 5 +// CK22-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP41]], align 4 +// CK22-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 6 +// CK22-NEXT: store ptr null, ptr [[TMP42]], align 4 +// CK22-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 7 +// CK22-NEXT: store ptr null, ptr [[TMP43]], align 4 +// CK22-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 8 +// CK22-NEXT: store i64 0, ptr [[TMP44]], align 8 +// CK22-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 9 +// CK22-NEXT: store i64 0, ptr [[TMP45]], align 8 +// CK22-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 10 +// CK22-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP46]], align 4 +// CK22-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 11 +// CK22-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP47]], align 4 +// CK22-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 12 +// CK22-NEXT: store i32 0, ptr [[TMP48]], align 4 +// CK22-NEXT: [[TMP49:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l118.region_id, ptr [[KERNEL_ARGS5]]) +// CK22-NEXT: [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0 +// CK22-NEXT: br i1 [[TMP50]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] +// CK22: omp_offload.failed6: +// CK22-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l118(ptr [[THIS1]]) #[[ATTR3]] +// CK22-NEXT: br label [[OMP_OFFLOAD_CONT7]] +// CK22: omp_offload.cont7: +// CK22-NEXT: [[A8:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 0 +// CK22-NEXT: [[B9:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 +// CK22-NEXT: [[TMP51:%.*]] = load ptr, ptr [[B9]], align 4 +// CK22-NEXT: [[TMP52:%.*]] = getelementptr ptr, ptr [[B9]], i32 1 +// CK22-NEXT: [[TMP53:%.*]] = ptrtoint ptr [[TMP52]] to i64 +// CK22-NEXT: [[TMP54:%.*]] = ptrtoint ptr [[A8]] to i64 +// CK22-NEXT: [[TMP55:%.*]] = sub i64 [[TMP53]], [[TMP54]] +// CK22-NEXT: [[TMP56:%.*]] = sdiv exact i64 [[TMP55]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +// CK22-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[DOTOFFLOAD_SIZES13]], ptr align 4 @.offload_sizes.3, i32 24, i1 false) +// CK22-NEXT: [[TMP57:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0 +// CK22-NEXT: store ptr [[THIS1]], ptr [[TMP57]], align 4 +// CK22-NEXT: [[TMP58:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0 +// CK22-NEXT: store ptr [[A8]], ptr [[TMP58]], align 4 +// CK22-NEXT: [[TMP59:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES13]], i32 0, i32 0 +// CK22-NEXT: store i64 [[TMP56]], ptr [[TMP59]], align 4 +// CK22-NEXT: [[TMP60:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS12]], i32 0, i32 0 +// CK22-NEXT: store ptr null, ptr [[TMP60]], align 4 +// CK22-NEXT: [[TMP61:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 1 +// CK22-NEXT: store ptr [[THIS1]], ptr [[TMP61]], align 4 +// CK22-NEXT: [[TMP62:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 1 +// CK22-NEXT: store ptr [[A8]], ptr [[TMP62]], align 4 +// CK22-NEXT: [[TMP63:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS12]], i32 0, i32 1 +// CK22-NEXT: store ptr null, ptr [[TMP63]], align 4 +// CK22-NEXT: [[TMP64:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 2 +// CK22-NEXT: store ptr [[THIS1]], ptr [[TMP64]], align 4 +// CK22-NEXT: [[TMP65:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 2 +// CK22-NEXT: store ptr [[TMP51]], ptr [[TMP65]], align 4 +// CK22-NEXT: [[TMP66:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS12]], i32 0, i32 2 +// CK22-NEXT: store ptr null, ptr [[TMP66]], align 4 +// CK22-NEXT: [[TMP67:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0 +// CK22-NEXT: [[TMP68:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0 +// CK22-NEXT: [[TMP69:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES13]], i32 0, i32 0 +// CK22-NEXT: [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0 +// CK22-NEXT: store i32 2, ptr [[TMP70]], align 4 +// CK22-NEXT: [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1 +// CK22-NEXT: store i32 3, ptr [[TMP71]], align 4 +// CK22-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2 +// CK22-NEXT: store ptr [[TMP67]], ptr [[TMP72]], align 4 +// CK22-NEXT: [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 3 +// CK22-NEXT: store 
ptr [[TMP68]], ptr [[TMP73]], align 4 +// CK22-NEXT: [[TMP74:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 4 +// CK22-NEXT: store ptr [[TMP69]], ptr [[TMP74]], align 4 +// CK22-NEXT: [[TMP75:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 5 +// CK22-NEXT: store ptr @.offload_maptypes.4, ptr [[TMP75]], align 4 +// CK22-NEXT: [[TMP76:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 6 +// CK22-NEXT: store ptr null, ptr [[TMP76]], align 4 +// CK22-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 7 +// CK22-NEXT: store ptr null, ptr [[TMP77]], align 4 +// CK22-NEXT: [[TMP78:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 8 +// CK22-NEXT: store i64 0, ptr [[TMP78]], align 8 +// CK22-NEXT: [[TMP79:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 9 +// CK22-NEXT: store i64 0, ptr [[TMP79]], align 8 +// CK22-NEXT: [[TMP80:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 10 +// CK22-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP80]], align 4 +// CK22-NEXT: [[TMP81:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 11 +// CK22-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP81]], align 4 +// CK22-NEXT: [[TMP82:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 12 +// CK22-NEXT: store i32 0, ptr [[TMP82]], align 4 +// CK22-NEXT: [[TMP83:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l125.region_id, ptr [[KERNEL_ARGS14]]) +// CK22-NEXT: [[TMP84:%.*]] = icmp ne i32 [[TMP83]], 0 +// CK22-NEXT: br i1 [[TMP84]], label [[OMP_OFFLOAD_FAILED15:%.*]], label [[OMP_OFFLOAD_CONT16:%.*]] +// CK22: omp_offload.failed15: +// CK22-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l125(ptr [[THIS1]]) #[[ATTR3]] +// CK22-NEXT: br label [[OMP_OFFLOAD_CONT16]] +// CK22: omp_offload.cont16: +// CK22-NEXT: ret void +// +// +// CK22-LABEL: define {{[^@]+}}@_ZN2STIdEC2ERPd +// CK22-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 { +// CK22-NEXT: entry: +// CK22-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// CK22-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4 +// CK22-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// CK22-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 4 +// CK22-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// CK22-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 +// CK22-NEXT: store ptr null, ptr [[A]], align 4 +// CK22-NEXT: [[B2:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 +// CK22-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4 +// CK22-NEXT: store ptr [[TMP0]], ptr [[B2]], align 4 +// CK22-NEXT: ret void +// +// +// CK22-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l112 +// CK22-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CK22-NEXT: entry: +// CK22-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// CK22-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 
+// CK22-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// CK22-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CK22-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 4 +// CK22-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK22-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 4 +// CK22-NEXT: ret void +// +// +// CK22-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l118 +// CK22-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CK22-NEXT: entry: +// CK22-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// CK22-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// CK22-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// CK22-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 1 +// CK22-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 4 +// CK22-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 4 +// CK22-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// CK22-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP1]], align 4 +// CK22-NEXT: ret void +// +// +// CK22-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l125 +// CK22-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CK22-NEXT: entry: +// CK22-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// CK22-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// CK22-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// CK22-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CK22-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 4 +// CK22-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK22-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 4 +// CK22-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[TMP0]], i32 0, i32 1 +// CK22-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B]], align 4 +// CK22-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 4 +// CK22-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// CK22-NEXT: store ptr [[INCDEC_PTR1]], ptr [[TMP2]], align 4 +// CK22-NEXT: ret void +// +// +// CK22-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// CK22-SAME: () #[[ATTR5:[0-9]+]] { +// CK22-NEXT: entry: +// CK22-NEXT: call void @__tgt_register_requires(i64 1) +// CK22-NEXT: ret void +// +// +// CK23-LABEL: define {{[^@]+}}@_Z3barPd +// CK23-SAME: (ptr noundef [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +// CK23-NEXT: entry: +// CK23-NEXT: [[ARG_ADDR:%.*]] = alloca ptr, align 4 +// CK23-NEXT: [[A:%.*]] = alloca [[STRUCT_ST:%.*]], align 4 +// CK23-NEXT: store ptr [[ARG]], ptr [[ARG_ADDR]], align 4 +// CK23-NEXT: call void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) +// CK23-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) +// CK23-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 4 +// CK23-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK23-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 4 +// CK23-NEXT: ret void +// +// +// CK23-LABEL: define {{[^@]+}}@_ZN2STIdEC1ERPd +// CK23-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 { +// CK23-NEXT: entry: +// 
CK23-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// CK23-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4 +// CK23-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// CK23-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 4 +// CK23-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// CK23-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4 +// CK23-NEXT: call void @_ZN2STIdEC2ERPd(ptr noundef nonnull align 4 dereferenceable(8) [[THIS1]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP0]]) +// CK23-NEXT: ret void +// +// +// CK23-LABEL: define {{[^@]+}}@_ZN2STIdE3fooERPd +// CK23-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG:%.*]]) #[[ATTR0]] comdat align 2 { +// CK23-NEXT: entry: +// CK23-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// CK23-NEXT: [[ARG_ADDR:%.*]] = alloca ptr, align 4 +// CK23-NEXT: [[LA:%.*]] = alloca ptr, align 4 +// CK23-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4 +// CK23-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4 +// CK23-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4 +// CK23-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK23-NEXT: [[DOTOFFLOAD_BASEPTRS2:%.*]] = alloca [2 x ptr], align 4 +// CK23-NEXT: [[DOTOFFLOAD_PTRS3:%.*]] = alloca [2 x ptr], align 4 +// CK23-NEXT: [[DOTOFFLOAD_MAPPERS4:%.*]] = alloca [2 x ptr], align 4 +// CK23-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [2 x i64], align 4 +// CK23-NEXT: [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK23-NEXT: [[DOTOFFLOAD_BASEPTRS10:%.*]] = alloca [3 x ptr], align 4 +// CK23-NEXT: [[DOTOFFLOAD_PTRS11:%.*]] = alloca [3 x ptr], align 4 +// CK23-NEXT: [[DOTOFFLOAD_MAPPERS12:%.*]] = alloca [3 x ptr], align 4 +// CK23-NEXT: [[DOTOFFLOAD_SIZES13:%.*]] = alloca [3 x i64], align 4 +// CK23-NEXT: [[KERNEL_ARGS14:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CK23-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// CK23-NEXT: store ptr [[ARG]], ptr [[ARG_ADDR]], align 4 +// CK23-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// CK23-NEXT: store ptr null, ptr [[LA]], align 4 +// CK23-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 +// CK23-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK23-NEXT: store ptr [[THIS1]], ptr [[TMP0]], align 4 +// CK23-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK23-NEXT: store ptr [[A]], ptr [[TMP1]], align 4 +// CK23-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CK23-NEXT: store ptr null, ptr [[TMP2]], align 4 +// CK23-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK23-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK23-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK23-NEXT: store i32 2, ptr [[TMP5]], align 4 +// CK23-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK23-NEXT: store i32 1, ptr [[TMP6]], align 4 +// CK23-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK23-NEXT: store ptr [[TMP3]], ptr [[TMP7]], align 4 +// CK23-NEXT: [[TMP8:%.*]] = 
getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK23-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 4 +// CK23-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK23-NEXT: store ptr @.offload_sizes, ptr [[TMP9]], align 4 +// CK23-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK23-NEXT: store ptr @.offload_maptypes, ptr [[TMP10]], align 4 +// CK23-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK23-NEXT: store ptr null, ptr [[TMP11]], align 4 +// CK23-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK23-NEXT: store ptr null, ptr [[TMP12]], align 4 +// CK23-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK23-NEXT: store i64 0, ptr [[TMP13]], align 8 +// CK23-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK23-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK23-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK23-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP15]], align 4 +// CK23-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK23-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4 +// CK23-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK23-NEXT: store i32 0, ptr [[TMP17]], align 4 +// CK23-NEXT: [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l112.region_id, ptr [[KERNEL_ARGS]]) +// CK23-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 +// CK23-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CK23: omp_offload.failed: +// CK23-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l112(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CK23-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CK23: omp_offload.cont: +// CK23-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 +// CK23-NEXT: [[TMP20:%.*]] = load ptr, ptr [[B]], align 4 +// CK23-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[B]], i32 1 +// CK23-NEXT: [[TMP22:%.*]] = ptrtoint ptr [[TMP21]] to i64 +// CK23-NEXT: [[TMP23:%.*]] = ptrtoint ptr [[B]] to i64 +// CK23-NEXT: [[TMP24:%.*]] = sub i64 [[TMP22]], [[TMP23]] +// CK23-NEXT: [[TMP25:%.*]] = sdiv exact i64 [[TMP24]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +// CK23-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[DOTOFFLOAD_SIZES]], ptr align 4 @.offload_sizes.1, i32 16, i1 false) +// CK23-NEXT: [[TMP26:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0 +// CK23-NEXT: store ptr [[THIS1]], ptr [[TMP26]], align 4 +// CK23-NEXT: [[TMP27:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0 +// CK23-NEXT: store ptr [[B]], ptr [[TMP27]], align 4 +// CK23-NEXT: [[TMP28:%.*]] = getelementptr inbounds [2 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +// CK23-NEXT: store i64 [[TMP25]], ptr [[TMP28]], align 4 +// CK23-NEXT: [[TMP29:%.*]] = 
getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 0 +// CK23-NEXT: store ptr null, ptr [[TMP29]], align 4 +// CK23-NEXT: [[TMP30:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 1 +// CK23-NEXT: store ptr [[THIS1]], ptr [[TMP30]], align 4 +// CK23-NEXT: [[TMP31:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 1 +// CK23-NEXT: store ptr [[TMP20]], ptr [[TMP31]], align 4 +// CK23-NEXT: [[TMP32:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 1 +// CK23-NEXT: store ptr null, ptr [[TMP32]], align 4 +// CK23-NEXT: [[TMP33:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0 +// CK23-NEXT: [[TMP34:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0 +// CK23-NEXT: [[TMP35:%.*]] = getelementptr inbounds [2 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +// CK23-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0 +// CK23-NEXT: store i32 2, ptr [[TMP36]], align 4 +// CK23-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1 +// CK23-NEXT: store i32 2, ptr [[TMP37]], align 4 +// CK23-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2 +// CK23-NEXT: store ptr [[TMP33]], ptr [[TMP38]], align 4 +// CK23-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 3 +// CK23-NEXT: store ptr [[TMP34]], ptr [[TMP39]], align 4 +// CK23-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 4 +// CK23-NEXT: store ptr [[TMP35]], ptr [[TMP40]], align 4 +// CK23-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 5 +// CK23-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP41]], align 4 +// CK23-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 6 +// CK23-NEXT: store ptr null, ptr [[TMP42]], align 4 +// CK23-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 7 +// CK23-NEXT: store ptr null, ptr [[TMP43]], align 4 +// CK23-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 8 +// CK23-NEXT: store i64 0, ptr [[TMP44]], align 8 +// CK23-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 9 +// CK23-NEXT: store i64 0, ptr [[TMP45]], align 8 +// CK23-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 10 +// CK23-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP46]], align 4 +// CK23-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 11 +// CK23-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP47]], align 4 +// CK23-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 12 +// CK23-NEXT: store i32 0, ptr [[TMP48]], align 4 +// CK23-NEXT: [[TMP49:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l118.region_id, ptr [[KERNEL_ARGS5]]) +// CK23-NEXT: [[TMP50:%.*]] = icmp ne 
i32 [[TMP49]], 0 +// CK23-NEXT: br i1 [[TMP50]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] +// CK23: omp_offload.failed6: +// CK23-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l118(ptr [[THIS1]]) #[[ATTR3]] +// CK23-NEXT: br label [[OMP_OFFLOAD_CONT7]] +// CK23: omp_offload.cont7: +// CK23-NEXT: [[A8:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 0 +// CK23-NEXT: [[B9:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 +// CK23-NEXT: [[TMP51:%.*]] = load ptr, ptr [[B9]], align 4 +// CK23-NEXT: [[TMP52:%.*]] = getelementptr ptr, ptr [[B9]], i32 1 +// CK23-NEXT: [[TMP53:%.*]] = ptrtoint ptr [[TMP52]] to i64 +// CK23-NEXT: [[TMP54:%.*]] = ptrtoint ptr [[A8]] to i64 +// CK23-NEXT: [[TMP55:%.*]] = sub i64 [[TMP53]], [[TMP54]] +// CK23-NEXT: [[TMP56:%.*]] = sdiv exact i64 [[TMP55]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +// CK23-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[DOTOFFLOAD_SIZES13]], ptr align 4 @.offload_sizes.3, i32 24, i1 false) +// CK23-NEXT: [[TMP57:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0 +// CK23-NEXT: store ptr [[THIS1]], ptr [[TMP57]], align 4 +// CK23-NEXT: [[TMP58:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0 +// CK23-NEXT: store ptr [[A8]], ptr [[TMP58]], align 4 +// CK23-NEXT: [[TMP59:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES13]], i32 0, i32 0 +// CK23-NEXT: store i64 [[TMP56]], ptr [[TMP59]], align 4 +// CK23-NEXT: [[TMP60:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS12]], i32 0, i32 0 +// CK23-NEXT: store ptr null, ptr [[TMP60]], align 4 +// CK23-NEXT: [[TMP61:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 1 +// CK23-NEXT: store ptr [[THIS1]], ptr [[TMP61]], align 4 +// CK23-NEXT: [[TMP62:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 1 +// CK23-NEXT: store ptr [[A8]], ptr [[TMP62]], align 4 +// CK23-NEXT: [[TMP63:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS12]], i32 0, i32 1 +// CK23-NEXT: store ptr null, ptr [[TMP63]], align 4 +// CK23-NEXT: [[TMP64:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 2 +// CK23-NEXT: store ptr [[THIS1]], ptr [[TMP64]], align 4 +// CK23-NEXT: [[TMP65:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 2 +// CK23-NEXT: store ptr [[TMP51]], ptr [[TMP65]], align 4 +// CK23-NEXT: [[TMP66:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS12]], i32 0, i32 2 +// CK23-NEXT: store ptr null, ptr [[TMP66]], align 4 +// CK23-NEXT: [[TMP67:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0 +// CK23-NEXT: [[TMP68:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0 +// CK23-NEXT: [[TMP69:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES13]], i32 0, i32 0 +// CK23-NEXT: [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0 +// CK23-NEXT: store i32 2, ptr [[TMP70]], align 4 +// CK23-NEXT: [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1 +// CK23-NEXT: store i32 3, ptr [[TMP71]], align 4 +// CK23-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2 +// CK23-NEXT: store ptr 
[[TMP67]], ptr [[TMP72]], align 4 +// CK23-NEXT: [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 3 +// CK23-NEXT: store ptr [[TMP68]], ptr [[TMP73]], align 4 +// CK23-NEXT: [[TMP74:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 4 +// CK23-NEXT: store ptr [[TMP69]], ptr [[TMP74]], align 4 +// CK23-NEXT: [[TMP75:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 5 +// CK23-NEXT: store ptr @.offload_maptypes.4, ptr [[TMP75]], align 4 +// CK23-NEXT: [[TMP76:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 6 +// CK23-NEXT: store ptr null, ptr [[TMP76]], align 4 +// CK23-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 7 +// CK23-NEXT: store ptr null, ptr [[TMP77]], align 4 +// CK23-NEXT: [[TMP78:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 8 +// CK23-NEXT: store i64 0, ptr [[TMP78]], align 8 +// CK23-NEXT: [[TMP79:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 9 +// CK23-NEXT: store i64 0, ptr [[TMP79]], align 8 +// CK23-NEXT: [[TMP80:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 10 +// CK23-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP80]], align 4 +// CK23-NEXT: [[TMP81:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 11 +// CK23-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP81]], align 4 +// CK23-NEXT: [[TMP82:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 12 +// CK23-NEXT: store i32 0, ptr [[TMP82]], align 4 +// CK23-NEXT: [[TMP83:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l125.region_id, ptr [[KERNEL_ARGS14]]) +// CK23-NEXT: [[TMP84:%.*]] = icmp ne i32 [[TMP83]], 0 +// CK23-NEXT: br i1 [[TMP84]], label [[OMP_OFFLOAD_FAILED15:%.*]], label [[OMP_OFFLOAD_CONT16:%.*]] +// CK23: omp_offload.failed15: +// CK23-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l125(ptr [[THIS1]]) #[[ATTR3]] +// CK23-NEXT: br label [[OMP_OFFLOAD_CONT16]] +// CK23: omp_offload.cont16: +// CK23-NEXT: ret void +// +// +// CK23-LABEL: define {{[^@]+}}@_ZN2STIdEC2ERPd +// CK23-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 { +// CK23-NEXT: entry: +// CK23-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// CK23-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4 +// CK23-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// CK23-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 4 +// CK23-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// CK23-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 +// CK23-NEXT: store ptr null, ptr [[A]], align 4 +// CK23-NEXT: [[B2:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 +// CK23-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4 +// CK23-NEXT: store ptr [[TMP0]], ptr [[B2]], align 4 +// CK23-NEXT: ret void +// +// +// CK23-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l112 +// CK23-SAME: (ptr 
noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CK23-NEXT: entry: +// CK23-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// CK23-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// CK23-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// CK23-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CK23-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 4 +// CK23-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK23-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 4 +// CK23-NEXT: ret void +// +// +// CK23-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l118 +// CK23-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CK23-NEXT: entry: +// CK23-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// CK23-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// CK23-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// CK23-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 1 +// CK23-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 4 +// CK23-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 4 +// CK23-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// CK23-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP1]], align 4 +// CK23-NEXT: ret void +// +// +// CK23-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l125 +// CK23-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CK23-NEXT: entry: +// CK23-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// CK23-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// CK23-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// CK23-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CK23-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 4 +// CK23-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK23-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 4 +// CK23-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[TMP0]], i32 0, i32 1 +// CK23-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B]], align 4 +// CK23-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 4 +// CK23-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// CK23-NEXT: store ptr [[INCDEC_PTR1]], ptr [[TMP2]], align 4 +// CK23-NEXT: ret void +// +// +// CK23-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// CK23-SAME: () #[[ATTR5:[0-9]+]] { +// CK23-NEXT: entry: +// CK23-NEXT: call void @__tgt_register_requires(i64 1) +// CK23-NEXT: ret void +// +// +// SIMD-ONLY10-LABEL: define {{[^@]+}}@_Z3barPd +// SIMD-ONLY10-SAME: (ptr noundef [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +// SIMD-ONLY10-NEXT: entry: +// SIMD-ONLY10-NEXT: [[ARG_ADDR:%.*]] = alloca ptr, align 8 +// SIMD-ONLY10-NEXT: [[A:%.*]] = alloca [[STRUCT_ST:%.*]], align 8 +// SIMD-ONLY10-NEXT: store ptr [[ARG]], ptr [[ARG_ADDR]], align 8 +// SIMD-ONLY10-NEXT: call void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) +// SIMD-ONLY10-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) +// SIMD-ONLY10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 8 +// SIMD-ONLY10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY10-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 8 +// SIMD-ONLY10-NEXT: ret void 
+// +// +// SIMD-ONLY10-LABEL: define {{[^@]+}}@_ZN2STIdEC1ERPd +// SIMD-ONLY10-SAME: (ptr noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 { +// SIMD-ONLY10-NEXT: entry: +// SIMD-ONLY10-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// SIMD-ONLY10-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// SIMD-ONLY10-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// SIMD-ONLY10-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// SIMD-ONLY10-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// SIMD-ONLY10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// SIMD-ONLY10-NEXT: call void @_ZN2STIdEC2ERPd(ptr noundef nonnull align 8 dereferenceable(16) [[THIS1]], ptr noundef nonnull align 8 dereferenceable(8) [[TMP0]]) +// SIMD-ONLY10-NEXT: ret void +// +// +// SIMD-ONLY10-LABEL: define {{[^@]+}}@_ZN2STIdE3fooERPd +// SIMD-ONLY10-SAME: (ptr noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG:%.*]]) #[[ATTR0]] comdat align 2 { +// SIMD-ONLY10-NEXT: entry: +// SIMD-ONLY10-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// SIMD-ONLY10-NEXT: [[ARG_ADDR:%.*]] = alloca ptr, align 8 +// SIMD-ONLY10-NEXT: [[LA:%.*]] = alloca ptr, align 8 +// SIMD-ONLY10-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// SIMD-ONLY10-NEXT: store ptr [[ARG]], ptr [[ARG_ADDR]], align 8 +// SIMD-ONLY10-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// SIMD-ONLY10-NEXT: store ptr null, ptr [[LA]], align 8 +// SIMD-ONLY10-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 +// SIMD-ONLY10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A]], align 8 +// SIMD-ONLY10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY10-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 8 +// SIMD-ONLY10-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 +// SIMD-ONLY10-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 8 +// SIMD-ONLY10-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 +// SIMD-ONLY10-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// SIMD-ONLY10-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP1]], align 8 +// SIMD-ONLY10-NEXT: [[A3:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 0 +// SIMD-ONLY10-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A3]], align 8 +// SIMD-ONLY10-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// SIMD-ONLY10-NEXT: store ptr [[INCDEC_PTR4]], ptr [[A3]], align 8 +// SIMD-ONLY10-NEXT: [[B5:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 +// SIMD-ONLY10-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B5]], align 8 +// SIMD-ONLY10-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 +// SIMD-ONLY10-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds double, ptr [[TMP5]], i32 1 +// SIMD-ONLY10-NEXT: store ptr [[INCDEC_PTR6]], ptr [[TMP4]], align 8 +// SIMD-ONLY10-NEXT: ret void +// +// +// SIMD-ONLY10-LABEL: define {{[^@]+}}@_ZN2STIdEC2ERPd +// SIMD-ONLY10-SAME: (ptr noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 { +// SIMD-ONLY10-NEXT: entry: +// SIMD-ONLY10-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// SIMD-ONLY10-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// SIMD-ONLY10-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], 
align 8 +// SIMD-ONLY10-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// SIMD-ONLY10-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// SIMD-ONLY10-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 +// SIMD-ONLY10-NEXT: store ptr null, ptr [[A]], align 8 +// SIMD-ONLY10-NEXT: [[B2:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 +// SIMD-ONLY10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// SIMD-ONLY10-NEXT: store ptr [[TMP0]], ptr [[B2]], align 8 +// SIMD-ONLY10-NEXT: ret void +// +// +// SIMD-ONLY11-LABEL: define {{[^@]+}}@_Z3barPd +// SIMD-ONLY11-SAME: (ptr noundef [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +// SIMD-ONLY11-NEXT: entry: +// SIMD-ONLY11-NEXT: [[ARG_ADDR:%.*]] = alloca ptr, align 8 +// SIMD-ONLY11-NEXT: [[A:%.*]] = alloca [[STRUCT_ST:%.*]], align 8 +// SIMD-ONLY11-NEXT: store ptr [[ARG]], ptr [[ARG_ADDR]], align 8 +// SIMD-ONLY11-NEXT: call void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) +// SIMD-ONLY11-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) +// SIMD-ONLY11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 8 +// SIMD-ONLY11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY11-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 8 +// SIMD-ONLY11-NEXT: ret void +// +// +// SIMD-ONLY11-LABEL: define {{[^@]+}}@_ZN2STIdEC1ERPd +// SIMD-ONLY11-SAME: (ptr noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 { +// SIMD-ONLY11-NEXT: entry: +// SIMD-ONLY11-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// SIMD-ONLY11-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// SIMD-ONLY11-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// SIMD-ONLY11-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// SIMD-ONLY11-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// SIMD-ONLY11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// SIMD-ONLY11-NEXT: call void @_ZN2STIdEC2ERPd(ptr noundef nonnull align 8 dereferenceable(16) [[THIS1]], ptr noundef nonnull align 8 dereferenceable(8) [[TMP0]]) +// SIMD-ONLY11-NEXT: ret void +// +// +// SIMD-ONLY11-LABEL: define {{[^@]+}}@_ZN2STIdE3fooERPd +// SIMD-ONLY11-SAME: (ptr noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG:%.*]]) #[[ATTR0]] comdat align 2 { +// SIMD-ONLY11-NEXT: entry: +// SIMD-ONLY11-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// SIMD-ONLY11-NEXT: [[ARG_ADDR:%.*]] = alloca ptr, align 8 +// SIMD-ONLY11-NEXT: [[LA:%.*]] = alloca ptr, align 8 +// SIMD-ONLY11-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// SIMD-ONLY11-NEXT: store ptr [[ARG]], ptr [[ARG_ADDR]], align 8 +// SIMD-ONLY11-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// SIMD-ONLY11-NEXT: store ptr null, ptr [[LA]], align 8 +// SIMD-ONLY11-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 +// SIMD-ONLY11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A]], align 8 +// SIMD-ONLY11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY11-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 8 +// SIMD-ONLY11-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 +// 
SIMD-ONLY11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 8 +// SIMD-ONLY11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 +// SIMD-ONLY11-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// SIMD-ONLY11-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP1]], align 8 +// SIMD-ONLY11-NEXT: [[A3:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 0 +// SIMD-ONLY11-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A3]], align 8 +// SIMD-ONLY11-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// SIMD-ONLY11-NEXT: store ptr [[INCDEC_PTR4]], ptr [[A3]], align 8 +// SIMD-ONLY11-NEXT: [[B5:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 +// SIMD-ONLY11-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B5]], align 8 +// SIMD-ONLY11-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 +// SIMD-ONLY11-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds double, ptr [[TMP5]], i32 1 +// SIMD-ONLY11-NEXT: store ptr [[INCDEC_PTR6]], ptr [[TMP4]], align 8 +// SIMD-ONLY11-NEXT: ret void +// +// +// SIMD-ONLY11-LABEL: define {{[^@]+}}@_ZN2STIdEC2ERPd +// SIMD-ONLY11-SAME: (ptr noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 { +// SIMD-ONLY11-NEXT: entry: +// SIMD-ONLY11-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// SIMD-ONLY11-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// SIMD-ONLY11-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// SIMD-ONLY11-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// SIMD-ONLY11-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// SIMD-ONLY11-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 +// SIMD-ONLY11-NEXT: store ptr null, ptr [[A]], align 8 +// SIMD-ONLY11-NEXT: [[B2:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 +// SIMD-ONLY11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// SIMD-ONLY11-NEXT: store ptr [[TMP0]], ptr [[B2]], align 8 +// SIMD-ONLY11-NEXT: ret void +// +// +// SIMD-ONLY12-LABEL: define {{[^@]+}}@_Z3barPd +// SIMD-ONLY12-SAME: (ptr noundef [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +// SIMD-ONLY12-NEXT: entry: +// SIMD-ONLY12-NEXT: [[ARG_ADDR:%.*]] = alloca ptr, align 4 +// SIMD-ONLY12-NEXT: [[A:%.*]] = alloca [[STRUCT_ST:%.*]], align 4 +// SIMD-ONLY12-NEXT: store ptr [[ARG]], ptr [[ARG_ADDR]], align 4 +// SIMD-ONLY12-NEXT: call void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) +// SIMD-ONLY12-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) +// SIMD-ONLY12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 4 +// SIMD-ONLY12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY12-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 4 +// SIMD-ONLY12-NEXT: ret void +// +// +// SIMD-ONLY12-LABEL: define {{[^@]+}}@_ZN2STIdEC1ERPd +// SIMD-ONLY12-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 { +// SIMD-ONLY12-NEXT: entry: +// SIMD-ONLY12-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// SIMD-ONLY12-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4 +// SIMD-ONLY12-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// SIMD-ONLY12-NEXT: store ptr 
[[B]], ptr [[B_ADDR]], align 4 +// SIMD-ONLY12-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// SIMD-ONLY12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4 +// SIMD-ONLY12-NEXT: call void @_ZN2STIdEC2ERPd(ptr noundef nonnull align 4 dereferenceable(8) [[THIS1]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP0]]) +// SIMD-ONLY12-NEXT: ret void +// +// +// SIMD-ONLY12-LABEL: define {{[^@]+}}@_ZN2STIdE3fooERPd +// SIMD-ONLY12-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG:%.*]]) #[[ATTR0]] comdat align 2 { +// SIMD-ONLY12-NEXT: entry: +// SIMD-ONLY12-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// SIMD-ONLY12-NEXT: [[ARG_ADDR:%.*]] = alloca ptr, align 4 +// SIMD-ONLY12-NEXT: [[LA:%.*]] = alloca ptr, align 4 +// SIMD-ONLY12-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// SIMD-ONLY12-NEXT: store ptr [[ARG]], ptr [[ARG_ADDR]], align 4 +// SIMD-ONLY12-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// SIMD-ONLY12-NEXT: store ptr null, ptr [[LA]], align 4 +// SIMD-ONLY12-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 +// SIMD-ONLY12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A]], align 4 +// SIMD-ONLY12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY12-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 4 +// SIMD-ONLY12-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 +// SIMD-ONLY12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 4 +// SIMD-ONLY12-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 4 +// SIMD-ONLY12-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// SIMD-ONLY12-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP1]], align 4 +// SIMD-ONLY12-NEXT: [[A3:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 0 +// SIMD-ONLY12-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A3]], align 4 +// SIMD-ONLY12-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// SIMD-ONLY12-NEXT: store ptr [[INCDEC_PTR4]], ptr [[A3]], align 4 +// SIMD-ONLY12-NEXT: [[B5:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 +// SIMD-ONLY12-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B5]], align 4 +// SIMD-ONLY12-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 4 +// SIMD-ONLY12-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds double, ptr [[TMP5]], i32 1 +// SIMD-ONLY12-NEXT: store ptr [[INCDEC_PTR6]], ptr [[TMP4]], align 4 +// SIMD-ONLY12-NEXT: ret void +// +// +// SIMD-ONLY12-LABEL: define {{[^@]+}}@_ZN2STIdEC2ERPd +// SIMD-ONLY12-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 { +// SIMD-ONLY12-NEXT: entry: +// SIMD-ONLY12-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// SIMD-ONLY12-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4 +// SIMD-ONLY12-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// SIMD-ONLY12-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 4 +// SIMD-ONLY12-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// SIMD-ONLY12-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 +// SIMD-ONLY12-NEXT: store ptr null, ptr [[A]], align 4 +// SIMD-ONLY12-NEXT: [[B2:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 +// SIMD-ONLY12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4 +// SIMD-ONLY12-NEXT: store ptr 
[[TMP0]], ptr [[B2]], align 4 +// SIMD-ONLY12-NEXT: ret void +// +// +// SIMD-ONLY13-LABEL: define {{[^@]+}}@_Z3barPd +// SIMD-ONLY13-SAME: (ptr noundef [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +// SIMD-ONLY13-NEXT: entry: +// SIMD-ONLY13-NEXT: [[ARG_ADDR:%.*]] = alloca ptr, align 4 +// SIMD-ONLY13-NEXT: [[A:%.*]] = alloca [[STRUCT_ST:%.*]], align 4 +// SIMD-ONLY13-NEXT: store ptr [[ARG]], ptr [[ARG_ADDR]], align 4 +// SIMD-ONLY13-NEXT: call void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) +// SIMD-ONLY13-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) +// SIMD-ONLY13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 4 +// SIMD-ONLY13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY13-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 4 +// SIMD-ONLY13-NEXT: ret void +// +// +// SIMD-ONLY13-LABEL: define {{[^@]+}}@_ZN2STIdEC1ERPd +// SIMD-ONLY13-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 { +// SIMD-ONLY13-NEXT: entry: +// SIMD-ONLY13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// SIMD-ONLY13-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4 +// SIMD-ONLY13-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// SIMD-ONLY13-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 4 +// SIMD-ONLY13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// SIMD-ONLY13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4 +// SIMD-ONLY13-NEXT: call void @_ZN2STIdEC2ERPd(ptr noundef nonnull align 4 dereferenceable(8) [[THIS1]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP0]]) +// SIMD-ONLY13-NEXT: ret void +// +// +// SIMD-ONLY13-LABEL: define {{[^@]+}}@_ZN2STIdE3fooERPd +// SIMD-ONLY13-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG:%.*]]) #[[ATTR0]] comdat align 2 { +// SIMD-ONLY13-NEXT: entry: +// SIMD-ONLY13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// SIMD-ONLY13-NEXT: [[ARG_ADDR:%.*]] = alloca ptr, align 4 +// SIMD-ONLY13-NEXT: [[LA:%.*]] = alloca ptr, align 4 +// SIMD-ONLY13-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// SIMD-ONLY13-NEXT: store ptr [[ARG]], ptr [[ARG_ADDR]], align 4 +// SIMD-ONLY13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// SIMD-ONLY13-NEXT: store ptr null, ptr [[LA]], align 4 +// SIMD-ONLY13-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 +// SIMD-ONLY13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A]], align 4 +// SIMD-ONLY13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY13-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 4 +// SIMD-ONLY13-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 +// SIMD-ONLY13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 4 +// SIMD-ONLY13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 4 +// SIMD-ONLY13-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// SIMD-ONLY13-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP1]], align 4 +// SIMD-ONLY13-NEXT: [[A3:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 0 +// SIMD-ONLY13-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A3]], align 4 +// SIMD-ONLY13-NEXT: [[INCDEC_PTR4:%.*]] = 
getelementptr inbounds double, ptr [[TMP3]], i32 1 +// SIMD-ONLY13-NEXT: store ptr [[INCDEC_PTR4]], ptr [[A3]], align 4 +// SIMD-ONLY13-NEXT: [[B5:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 +// SIMD-ONLY13-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B5]], align 4 +// SIMD-ONLY13-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 4 +// SIMD-ONLY13-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds double, ptr [[TMP5]], i32 1 +// SIMD-ONLY13-NEXT: store ptr [[INCDEC_PTR6]], ptr [[TMP4]], align 4 +// SIMD-ONLY13-NEXT: ret void +// +// +// SIMD-ONLY13-LABEL: define {{[^@]+}}@_ZN2STIdEC2ERPd +// SIMD-ONLY13-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 { +// SIMD-ONLY13-NEXT: entry: +// SIMD-ONLY13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// SIMD-ONLY13-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4 +// SIMD-ONLY13-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// SIMD-ONLY13-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 4 +// SIMD-ONLY13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// SIMD-ONLY13-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 +// SIMD-ONLY13-NEXT: store ptr null, ptr [[A]], align 4 +// SIMD-ONLY13-NEXT: [[B2:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 +// SIMD-ONLY13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4 +// SIMD-ONLY13-NEXT: store ptr [[TMP0]], ptr [[B2]], align 4 +// SIMD-ONLY13-NEXT: ret void +// +// +// CK30-LABEL: define {{[^@]+}}@_Z3barv +// CK30-SAME: () #[[ATTR0:[0-9]+]] { +// CK30-NEXT: entry: +// CK30-NEXT: [[PTR:%.*]] = alloca ptr, align 64 +// CK30-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +// CK30-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 +// CK30-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 +// CK30-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK30-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR]], align 64 +// CK30-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK30-NEXT: store ptr [[TMP0]], ptr [[TMP1]], align 8 +// CK30-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK30-NEXT: store ptr [[TMP0]], ptr [[TMP2]], align 8 +// CK30-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CK30-NEXT: store ptr null, ptr [[TMP3]], align 8 +// CK30-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK30-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK30-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK30-NEXT: store i32 2, ptr [[TMP6]], align 4 +// CK30-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK30-NEXT: store i32 1, ptr [[TMP7]], align 4 +// CK30-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK30-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 8 +// CK30-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK30-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 8 +// CK30-NEXT: [[TMP10:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK30-NEXT: store ptr @.offload_sizes, ptr [[TMP10]], align 8 +// CK30-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK30-NEXT: store ptr @.offload_maptypes, ptr [[TMP11]], align 8 +// CK30-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK30-NEXT: store ptr null, ptr [[TMP12]], align 8 +// CK30-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK30-NEXT: store ptr null, ptr [[TMP13]], align 8 +// CK30-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK30-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK30-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK30-NEXT: store i64 0, ptr [[TMP15]], align 8 +// CK30-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK30-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP16]], align 4 +// CK30-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK30-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4 +// CK30-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK30-NEXT: store i32 0, ptr [[TMP18]], align 4 +// CK30-NEXT: [[TMP19:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3barv_l159.region_id, ptr [[KERNEL_ARGS]]) +// CK30-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CK30-NEXT: br i1 [[TMP20]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CK30: omp_offload.failed: +// CK30-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3barv_l159(ptr [[TMP0]]) #[[ATTR2:[0-9]+]] +// CK30-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CK30: omp_offload.cont: +// CK30-NEXT: ret void +// +// +// CK30-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3barv_l159 +// CK30-SAME: (ptr noundef [[PTR:%.*]]) #[[ATTR1:[0-9]+]] { +// CK30-NEXT: entry: +// CK30-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 +// CK30-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8 +// CK30-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 +// CK30-NEXT: store double 0.000000e+00, ptr [[TMP0]], align 8 +// CK30-NEXT: ret void +// +// +// CK30-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// CK30-SAME: () #[[ATTR3:[0-9]+]] { +// CK30-NEXT: entry: +// CK30-NEXT: call void @__tgt_register_requires(i64 1) +// CK30-NEXT: ret void +// +// +// CK31-LABEL: define {{[^@]+}}@_Z3barv +// CK31-SAME: () #[[ATTR0:[0-9]+]] { +// CK31-NEXT: entry: +// CK31-NEXT: [[PTR:%.*]] = alloca ptr, align 64 +// CK31-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +// CK31-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 +// CK31-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 +// CK31-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK31-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR]], align 64 +// CK31-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK31-NEXT: store ptr [[TMP0]], ptr [[TMP1]], align 8 +// CK31-NEXT: 
[[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK31-NEXT: store ptr [[TMP0]], ptr [[TMP2]], align 8 +// CK31-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CK31-NEXT: store ptr null, ptr [[TMP3]], align 8 +// CK31-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK31-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK31-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK31-NEXT: store i32 2, ptr [[TMP6]], align 4 +// CK31-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK31-NEXT: store i32 1, ptr [[TMP7]], align 4 +// CK31-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK31-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 8 +// CK31-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK31-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 8 +// CK31-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK31-NEXT: store ptr @.offload_sizes, ptr [[TMP10]], align 8 +// CK31-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK31-NEXT: store ptr @.offload_maptypes, ptr [[TMP11]], align 8 +// CK31-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK31-NEXT: store ptr null, ptr [[TMP12]], align 8 +// CK31-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK31-NEXT: store ptr null, ptr [[TMP13]], align 8 +// CK31-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK31-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK31-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK31-NEXT: store i64 0, ptr [[TMP15]], align 8 +// CK31-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK31-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP16]], align 4 +// CK31-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK31-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4 +// CK31-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK31-NEXT: store i32 0, ptr [[TMP18]], align 4 +// CK31-NEXT: [[TMP19:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3barv_l159.region_id, ptr [[KERNEL_ARGS]]) +// CK31-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CK31-NEXT: br i1 [[TMP20]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CK31: omp_offload.failed: +// CK31-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3barv_l159(ptr [[TMP0]]) #[[ATTR2:[0-9]+]] +// CK31-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CK31: omp_offload.cont: +// CK31-NEXT: ret void +// +// +// CK31-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3barv_l159 +// 
CK31-SAME: (ptr noundef [[PTR:%.*]]) #[[ATTR1:[0-9]+]] { +// CK31-NEXT: entry: +// CK31-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 +// CK31-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8 +// CK31-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 +// CK31-NEXT: store double 0.000000e+00, ptr [[TMP0]], align 8 +// CK31-NEXT: ret void +// +// +// CK31-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// CK31-SAME: () #[[ATTR3:[0-9]+]] { +// CK31-NEXT: entry: +// CK31-NEXT: call void @__tgt_register_requires(i64 1) +// CK31-NEXT: ret void +// +// +// CK32-LABEL: define {{[^@]+}}@_Z3barv +// CK32-SAME: () #[[ATTR0:[0-9]+]] { +// CK32-NEXT: entry: +// CK32-NEXT: [[PTR:%.*]] = alloca ptr, align 64 +// CK32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4 +// CK32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4 +// CK32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4 +// CK32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR]], align 64 +// CK32-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK32-NEXT: store ptr [[TMP0]], ptr [[TMP1]], align 4 +// CK32-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK32-NEXT: store ptr [[TMP0]], ptr [[TMP2]], align 4 +// CK32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CK32-NEXT: store ptr null, ptr [[TMP3]], align 4 +// CK32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK32-NEXT: store i32 2, ptr [[TMP6]], align 4 +// CK32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK32-NEXT: store i32 1, ptr [[TMP7]], align 4 +// CK32-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK32-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 4 +// CK32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK32-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 4 +// CK32-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK32-NEXT: store ptr @.offload_sizes, ptr [[TMP10]], align 4 +// CK32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK32-NEXT: store ptr @.offload_maptypes, ptr [[TMP11]], align 4 +// CK32-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK32-NEXT: store ptr null, ptr [[TMP12]], align 4 +// CK32-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK32-NEXT: store ptr null, ptr [[TMP13]], align 4 +// CK32-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK32-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK32-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK32-NEXT: store i64 0, ptr [[TMP15]], align 8 +// 
CK32-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP16]], align 4 +// CK32-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4 +// CK32-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK32-NEXT: store i32 0, ptr [[TMP18]], align 4 +// CK32-NEXT: [[TMP19:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3barv_l159.region_id, ptr [[KERNEL_ARGS]]) +// CK32-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CK32-NEXT: br i1 [[TMP20]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CK32: omp_offload.failed: +// CK32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3barv_l159(ptr [[TMP0]]) #[[ATTR2:[0-9]+]] +// CK32-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CK32: omp_offload.cont: +// CK32-NEXT: ret void +// +// +// CK32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3barv_l159 +// CK32-SAME: (ptr noundef [[PTR:%.*]]) #[[ATTR1:[0-9]+]] { +// CK32-NEXT: entry: +// CK32-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4 +// CK32-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 4 +// CK32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4 +// CK32-NEXT: store double 0.000000e+00, ptr [[TMP0]], align 4 +// CK32-NEXT: ret void +// +// +// CK32-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// CK32-SAME: () #[[ATTR3:[0-9]+]] { +// CK32-NEXT: entry: +// CK32-NEXT: call void @__tgt_register_requires(i64 1) +// CK32-NEXT: ret void +// +// +// CK33-LABEL: define {{[^@]+}}@_Z3barv +// CK33-SAME: () #[[ATTR0:[0-9]+]] { +// CK33-NEXT: entry: +// CK33-NEXT: [[PTR:%.*]] = alloca ptr, align 64 +// CK33-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4 +// CK33-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4 +// CK33-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4 +// CK33-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK33-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR]], align 64 +// CK33-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK33-NEXT: store ptr [[TMP0]], ptr [[TMP1]], align 4 +// CK33-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK33-NEXT: store ptr [[TMP0]], ptr [[TMP2]], align 4 +// CK33-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CK33-NEXT: store ptr null, ptr [[TMP3]], align 4 +// CK33-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK33-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK33-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK33-NEXT: store i32 2, ptr [[TMP6]], align 4 +// CK33-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK33-NEXT: store i32 1, ptr [[TMP7]], align 4 +// CK33-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK33-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 4 +// 
CK33-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK33-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 4 +// CK33-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK33-NEXT: store ptr @.offload_sizes, ptr [[TMP10]], align 4 +// CK33-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK33-NEXT: store ptr @.offload_maptypes, ptr [[TMP11]], align 4 +// CK33-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK33-NEXT: store ptr null, ptr [[TMP12]], align 4 +// CK33-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK33-NEXT: store ptr null, ptr [[TMP13]], align 4 +// CK33-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK33-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK33-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK33-NEXT: store i64 0, ptr [[TMP15]], align 8 +// CK33-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK33-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP16]], align 4 +// CK33-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK33-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4 +// CK33-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK33-NEXT: store i32 0, ptr [[TMP18]], align 4 +// CK33-NEXT: [[TMP19:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3barv_l159.region_id, ptr [[KERNEL_ARGS]]) +// CK33-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CK33-NEXT: br i1 [[TMP20]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CK33: omp_offload.failed: +// CK33-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3barv_l159(ptr [[TMP0]]) #[[ATTR2:[0-9]+]] +// CK33-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CK33: omp_offload.cont: +// CK33-NEXT: ret void +// +// +// CK33-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3barv_l159 +// CK33-SAME: (ptr noundef [[PTR:%.*]]) #[[ATTR1:[0-9]+]] { +// CK33-NEXT: entry: +// CK33-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4 +// CK33-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 4 +// CK33-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4 +// CK33-NEXT: store double 0.000000e+00, ptr [[TMP0]], align 4 +// CK33-NEXT: ret void +// +// +// CK33-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg +// CK33-SAME: () #[[ATTR3:[0-9]+]] { +// CK33-NEXT: entry: +// CK33-NEXT: call void @__tgt_register_requires(i64 1) +// CK33-NEXT: ret void +// +// +// SIMD-ONLY20-LABEL: define {{[^@]+}}@_Z3barv +// SIMD-ONLY20-SAME: () #[[ATTR0:[0-9]+]] { +// SIMD-ONLY20-NEXT: entry: +// SIMD-ONLY20-NEXT: [[PTR:%.*]] = alloca ptr, align 64 +// SIMD-ONLY20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR]], align 64 +// SIMD-ONLY20-NEXT: store double 0.000000e+00, ptr [[TMP0]], align 8 +// SIMD-ONLY20-NEXT: ret void +// +// +// SIMD-ONLY21-LABEL: define {{[^@]+}}@_Z3barv +// SIMD-ONLY21-SAME: () #[[ATTR0:[0-9]+]] { +// 
SIMD-ONLY21-NEXT: entry: +// SIMD-ONLY21-NEXT: [[PTR:%.*]] = alloca ptr, align 64 +// SIMD-ONLY21-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR]], align 64 +// SIMD-ONLY21-NEXT: store double 0.000000e+00, ptr [[TMP0]], align 8 +// SIMD-ONLY21-NEXT: ret void +// +// +// SIMD-ONLY22-LABEL: define {{[^@]+}}@_Z3barv +// SIMD-ONLY22-SAME: () #[[ATTR0:[0-9]+]] { +// SIMD-ONLY22-NEXT: entry: +// SIMD-ONLY22-NEXT: [[PTR:%.*]] = alloca ptr, align 64 +// SIMD-ONLY22-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR]], align 64 +// SIMD-ONLY22-NEXT: store double 0.000000e+00, ptr [[TMP0]], align 4 +// SIMD-ONLY22-NEXT: ret void +// +// +// SIMD-ONLY23-LABEL: define {{[^@]+}}@_Z3barv +// SIMD-ONLY23-SAME: () #[[ATTR0:[0-9]+]] { +// SIMD-ONLY23-NEXT: entry: +// SIMD-ONLY23-NEXT: [[PTR:%.*]] = alloca ptr, align 64 +// SIMD-ONLY23-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR]], align 64 +// SIMD-ONLY23-NEXT: store double 0.000000e+00, ptr [[TMP0]], align 4 +// SIMD-ONLY23-NEXT: ret void +// diff --git a/clang/test/OpenMP/teams_firstprivate_codegen.cpp b/clang/test/OpenMP/teams_firstprivate_codegen.cpp index c1617e16b402c..e8f9512114f7f 100644 --- a/clang/test/OpenMP/teams_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/teams_firstprivate_codegen.cpp @@ -168,46 +168,42 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l91 -// CHECK1-SAME: (ptr nonnull align 4 dereferenceable(4) [[G:%.*]], i64 [[SIVAR:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK1-SAME: (i64 [[G:%.*]], i64 [[SIVAR:%.*]]) #[[ATTR2:[0-9]+]] { // CHECK1-NEXT: entry: -// CHECK1-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[G_ADDR:%.*]] = alloca i64, align 8 // CHECK1-NEXT: [[SIVAR_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[G1:%.*]] = alloca i32, align 128 +// CHECK1-NEXT: [[G_CASTED:%.*]] = alloca i64, align 8 // CHECK1-NEXT: [[SIVAR_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[G]], ptr [[G_ADDR]], align 8 // CHECK1-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[G_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 128 -// CHECK1-NEXT: store i32 [[TMP1]], ptr [[G1]], align 128 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[G_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP0]], ptr [[G_CASTED]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[G_CASTED]], align 8 // CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[SIVAR_ADDR]], align 4 // CHECK1-NEXT: store i32 [[TMP2]], ptr [[SIVAR_CASTED]], align 4 // CHECK1-NEXT: [[TMP3:%.*]] = load i64, ptr [[SIVAR_CASTED]], align 8 -// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1:[0-9]+]], i32 2, ptr @.omp_outlined., ptr [[G1]], i64 [[TMP3]]) +// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1:[0-9]+]], i32 2, ptr @.omp_outlined., i64 [[TMP1]], i64 [[TMP3]]) // CHECK1-NEXT: ret void // // // CHECK1-LABEL: define {{[^@]+}}@.omp_outlined. 
-// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[G:%.*]], i64 [[SIVAR:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[G:%.*]], i64 [[SIVAR:%.*]]) #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[G_ADDR:%.*]] = alloca i64, align 8 // CHECK1-NEXT: [[SIVAR_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[G1:%.*]] = alloca i32, align 128 // CHECK1-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_ANON_0:%.*]], align 8 // CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[G]], ptr [[G_ADDR]], align 8 // CHECK1-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[G_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 128 -// CHECK1-NEXT: store i32 [[TMP1]], ptr [[G1]], align 128 -// CHECK1-NEXT: store i32 1, ptr [[G1]], align 128 +// CHECK1-NEXT: store i32 1, ptr [[G_ADDR]], align 4 // CHECK1-NEXT: store i32 2, ptr [[SIVAR_ADDR]], align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 0 -// CHECK1-NEXT: store ptr [[G1]], ptr [[TMP2]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 1 -// CHECK1-NEXT: store ptr [[SIVAR_ADDR]], ptr [[TMP3]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 0 +// CHECK1-NEXT: store ptr [[G_ADDR]], ptr [[TMP0]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 1 +// CHECK1-NEXT: store ptr [[SIVAR_ADDR]], ptr [[TMP1]], align 8 // CHECK1-NEXT: call void @"_ZZZ4mainENK3$_0clEvENKUlvE_clEv"(ptr nonnull align 8 dereferenceable(16) [[REF_TMP]]) // CHECK1-NEXT: ret void // @@ -230,46 +226,42 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l91 -// CHECK3-SAME: (ptr nonnull align 4 dereferenceable(4) [[G:%.*]], i32 [[SIVAR:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK3-SAME: (i32 [[G:%.*]], i32 [[SIVAR:%.*]]) #[[ATTR2:[0-9]+]] { // CHECK3-NEXT: entry: -// CHECK3-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[G_ADDR:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[SIVAR_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[G1:%.*]] = alloca i32, align 128 +// CHECK3-NEXT: [[G_CASTED:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[SIVAR_CASTED:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[G]], ptr [[G_ADDR]], align 4 // CHECK3-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[G_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 128 -// CHECK3-NEXT: store i32 [[TMP1]], ptr [[G1]], align 128 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32, ptr [[G_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP0]], ptr [[G_CASTED]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[G_CASTED]], align 4 // CHECK3-NEXT: 
[[TMP2:%.*]] = load i32, ptr [[SIVAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP2]], ptr [[SIVAR_CASTED]], align 4 // CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[SIVAR_CASTED]], align 4 -// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1:[0-9]+]], i32 2, ptr @.omp_outlined., ptr [[G1]], i32 [[TMP3]]) +// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1:[0-9]+]], i32 2, ptr @.omp_outlined., i32 [[TMP1]], i32 [[TMP3]]) // CHECK3-NEXT: ret void // // // CHECK3-LABEL: define {{[^@]+}}@.omp_outlined. -// CHECK3-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[G:%.*]], i32 [[SIVAR:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i32 [[G:%.*]], i32 [[SIVAR:%.*]]) #[[ATTR2]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 -// CHECK3-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[G_ADDR:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[SIVAR_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[G1:%.*]] = alloca i32, align 128 // CHECK3-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_ANON_0:%.*]], align 4 // CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[G]], ptr [[G_ADDR]], align 4 // CHECK3-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[G_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 128 -// CHECK3-NEXT: store i32 [[TMP1]], ptr [[G1]], align 128 -// CHECK3-NEXT: store i32 1, ptr [[G1]], align 128 +// CHECK3-NEXT: store i32 1, ptr [[G_ADDR]], align 4 // CHECK3-NEXT: store i32 2, ptr [[SIVAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 0 -// CHECK3-NEXT: store ptr [[G1]], ptr [[TMP2]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 1 -// CHECK3-NEXT: store ptr [[SIVAR_ADDR]], ptr [[TMP3]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 0 +// CHECK3-NEXT: store ptr [[G_ADDR]], ptr [[TMP0]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 1 +// CHECK3-NEXT: store ptr [[SIVAR_ADDR]], ptr [[TMP1]], align 4 // CHECK3-NEXT: call void @"_ZZZ4mainENK3$_0clEvENKUlvE_clEv"(ptr nonnull align 4 dereferenceable(8) [[REF_TMP]]) // CHECK3-NEXT: ret void // @@ -637,14 +629,16 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 128 // CHECK9-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S.0], align 128 // CHECK9-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S_0]], align 128 +// CHECK9-NEXT: [[T_VAR_CASTED:%.*]] = alloca i64, align 8 // CHECK9-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [4 x ptr], align 8 // CHECK9-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [4 x ptr], align 8 // CHECK9-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [4 x ptr], align 8 // CHECK9-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 -// CHECK9-NEXT: [[DOTOFFLOAD_BASEPTRS1:%.*]] = alloca [1 x ptr], align 8 -// CHECK9-NEXT: 
[[DOTOFFLOAD_PTRS2:%.*]] = alloca [1 x ptr], align 8 -// CHECK9-NEXT: [[DOTOFFLOAD_MAPPERS3:%.*]] = alloca [1 x ptr], align 8 -// CHECK9-NEXT: [[KERNEL_ARGS4:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK9-NEXT: [[T_VAR_CASTED1:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[DOTOFFLOAD_BASEPTRS2:%.*]] = alloca [1 x ptr], align 8 +// CHECK9-NEXT: [[DOTOFFLOAD_PTRS3:%.*]] = alloca [1 x ptr], align 8 +// CHECK9-NEXT: [[DOTOFFLOAD_MAPPERS4:%.*]] = alloca [1 x ptr], align 8 +// CHECK9-NEXT: [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 // CHECK9-NEXT: call void @_ZN1SIiEC1Ev(ptr nonnull align 4 dereferenceable(4) [[TEST]]) // CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 128 // CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 128 [[VEC]], ptr align 128 @__const._Z5tmainIiET_v.vec, i64 8, i1 false) @@ -653,121 +647,127 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1 // CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 signext 2) // CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr nonnull align 4 dereferenceable(4) [[VAR]], i32 signext 3) -// CHECK9-NEXT: [[TMP0:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 -// CHECK9-NEXT: store ptr [[T_VAR]], ptr [[TMP0]], align 8 -// CHECK9-NEXT: [[TMP1:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 -// CHECK9-NEXT: store ptr [[T_VAR]], ptr [[TMP1]], align 8 -// CHECK9-NEXT: [[TMP2:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 -// CHECK9-NEXT: store ptr null, ptr [[TMP2]], align 8 -// CHECK9-NEXT: [[TMP3:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 -// CHECK9-NEXT: store ptr [[VEC]], ptr [[TMP3]], align 8 -// CHECK9-NEXT: [[TMP4:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1 -// CHECK9-NEXT: store ptr [[VEC]], ptr [[TMP4]], align 8 -// CHECK9-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 -// CHECK9-NEXT: store ptr null, ptr [[TMP5]], align 8 -// CHECK9-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 -// CHECK9-NEXT: store ptr [[S_ARR]], ptr [[TMP6]], align 8 -// CHECK9-NEXT: [[TMP7:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2 -// CHECK9-NEXT: store ptr [[S_ARR]], ptr [[TMP7]], align 8 -// CHECK9-NEXT: [[TMP8:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2 -// CHECK9-NEXT: store ptr null, ptr [[TMP8]], align 8 -// CHECK9-NEXT: [[TMP9:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 3 -// CHECK9-NEXT: store ptr [[VAR]], ptr [[TMP9]], align 8 -// CHECK9-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 3 -// CHECK9-NEXT: store ptr [[VAR]], ptr [[TMP10]], align 8 -// CHECK9-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 3 -// CHECK9-NEXT: store ptr null, ptr [[TMP11]], align 8 -// CHECK9-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 -// CHECK9-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 -// CHECK9-NEXT: [[TMP14:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 -// CHECK9-NEXT: store i32 2, ptr [[TMP14]], align 4 -// CHECK9-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 -// CHECK9-NEXT: store i32 4, ptr [[TMP15]], align 4 -// CHECK9-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 -// CHECK9-NEXT: store ptr [[TMP12]], ptr [[TMP16]], align 8 -// CHECK9-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 -// CHECK9-NEXT: store ptr [[TMP13]], ptr [[TMP17]], align 8 -// CHECK9-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 -// CHECK9-NEXT: store ptr @.offload_sizes.5, ptr [[TMP18]], align 8 -// CHECK9-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 -// CHECK9-NEXT: store ptr @.offload_maptypes.6, ptr [[TMP19]], align 8 -// CHECK9-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 -// CHECK9-NEXT: store ptr null, ptr [[TMP20]], align 8 -// CHECK9-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 -// CHECK9-NEXT: store ptr null, ptr [[TMP21]], align 8 -// CHECK9-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 -// CHECK9-NEXT: store i64 0, ptr [[TMP22]], align 8 -// CHECK9-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 -// CHECK9-NEXT: store i64 0, ptr [[TMP23]], align 8 -// CHECK9-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 -// CHECK9-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP24]], align 4 -// CHECK9-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 -// CHECK9-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP25]], align 4 -// CHECK9-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 -// CHECK9-NEXT: store i32 0, ptr [[TMP26]], align 4 -// CHECK9-NEXT: [[TMP27:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l75.region_id, ptr [[KERNEL_ARGS]]) -// CHECK9-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0 -// CHECK9-NEXT: br i1 [[TMP28]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK9-NEXT: [[TMP0:%.*]] = load i32, ptr [[T_VAR]], align 128 +// CHECK9-NEXT: store i32 [[TMP0]], ptr [[T_VAR_CASTED]], align 4 +// CHECK9-NEXT: [[TMP1:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 +// CHECK9-NEXT: [[TMP2:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK9-NEXT: store i64 [[TMP1]], ptr [[TMP2]], align 8 +// CHECK9-NEXT: [[TMP3:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK9-NEXT: store i64 [[TMP1]], ptr [[TMP3]], align 8 +// CHECK9-NEXT: [[TMP4:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK9-NEXT: store ptr null, ptr [[TMP4]], align 8 +// CHECK9-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK9-NEXT: store ptr 
[[VEC]], ptr [[TMP5]], align 8 +// CHECK9-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK9-NEXT: store ptr [[VEC]], ptr [[TMP6]], align 8 +// CHECK9-NEXT: [[TMP7:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CHECK9-NEXT: store ptr null, ptr [[TMP7]], align 8 +// CHECK9-NEXT: [[TMP8:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK9-NEXT: store ptr [[S_ARR]], ptr [[TMP8]], align 8 +// CHECK9-NEXT: [[TMP9:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK9-NEXT: store ptr [[S_ARR]], ptr [[TMP9]], align 8 +// CHECK9-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2 +// CHECK9-NEXT: store ptr null, ptr [[TMP10]], align 8 +// CHECK9-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 3 +// CHECK9-NEXT: store ptr [[VAR]], ptr [[TMP11]], align 8 +// CHECK9-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 3 +// CHECK9-NEXT: store ptr [[VAR]], ptr [[TMP12]], align 8 +// CHECK9-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 3 +// CHECK9-NEXT: store ptr null, ptr [[TMP13]], align 8 +// CHECK9-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK9-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK9-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK9-NEXT: store i32 2, ptr [[TMP16]], align 4 +// CHECK9-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK9-NEXT: store i32 4, ptr [[TMP17]], align 4 +// CHECK9-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK9-NEXT: store ptr [[TMP14]], ptr [[TMP18]], align 8 +// CHECK9-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK9-NEXT: store ptr [[TMP15]], ptr [[TMP19]], align 8 +// CHECK9-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK9-NEXT: store ptr @.offload_sizes.5, ptr [[TMP20]], align 8 +// CHECK9-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK9-NEXT: store ptr @.offload_maptypes.6, ptr [[TMP21]], align 8 +// CHECK9-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK9-NEXT: store ptr null, ptr [[TMP22]], align 8 +// CHECK9-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK9-NEXT: store ptr null, ptr [[TMP23]], align 8 +// CHECK9-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK9-NEXT: store i64 0, ptr [[TMP24]], align 8 +// CHECK9-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK9-NEXT: store i64 0, ptr [[TMP25]], align 8 +// CHECK9-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK9-NEXT: store [3 x 
i32] zeroinitializer, ptr [[TMP26]], align 4 +// CHECK9-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK9-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP27]], align 4 +// CHECK9-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK9-NEXT: store i32 0, ptr [[TMP28]], align 4 +// CHECK9-NEXT: [[TMP29:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l75.region_id, ptr [[KERNEL_ARGS]]) +// CHECK9-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 +// CHECK9-NEXT: br i1 [[TMP30]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l75(ptr [[T_VAR]], ptr [[VEC]], ptr [[S_ARR]], ptr [[VAR]]) #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l75(i64 [[TMP1]], ptr [[VEC]], ptr [[S_ARR]], ptr [[VAR]]) #[[ATTR4]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: -// CHECK9-NEXT: [[TMP29:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 -// CHECK9-NEXT: store ptr [[T_VAR]], ptr [[TMP29]], align 8 -// CHECK9-NEXT: [[TMP30:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0 -// CHECK9-NEXT: store ptr [[T_VAR]], ptr [[TMP30]], align 8 -// CHECK9-NEXT: [[TMP31:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS3]], i64 0, i64 0 -// CHECK9-NEXT: store ptr null, ptr [[TMP31]], align 8 -// CHECK9-NEXT: [[TMP32:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 -// CHECK9-NEXT: [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0 -// CHECK9-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0 -// CHECK9-NEXT: store i32 2, ptr [[TMP34]], align 4 -// CHECK9-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1 -// CHECK9-NEXT: store i32 1, ptr [[TMP35]], align 4 -// CHECK9-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2 -// CHECK9-NEXT: store ptr [[TMP32]], ptr [[TMP36]], align 8 -// CHECK9-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 3 -// CHECK9-NEXT: store ptr [[TMP33]], ptr [[TMP37]], align 8 -// CHECK9-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 4 -// CHECK9-NEXT: store ptr @.offload_sizes.8, ptr [[TMP38]], align 8 -// CHECK9-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 5 -// CHECK9-NEXT: store ptr @.offload_maptypes.9, ptr [[TMP39]], align 8 -// CHECK9-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 6 -// CHECK9-NEXT: store ptr null, ptr [[TMP40]], align 8 -// CHECK9-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 7 -// CHECK9-NEXT: store ptr null, ptr [[TMP41]], align 8 -// CHECK9-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 8 -// CHECK9-NEXT: store i64 
0, ptr [[TMP42]], align 8 -// CHECK9-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 9 -// CHECK9-NEXT: store i64 0, ptr [[TMP43]], align 8 -// CHECK9-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 10 -// CHECK9-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP44]], align 4 -// CHECK9-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 11 -// CHECK9-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP45]], align 4 -// CHECK9-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 12 -// CHECK9-NEXT: store i32 0, ptr [[TMP46]], align 4 -// CHECK9-NEXT: [[TMP47:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81.region_id, ptr [[KERNEL_ARGS4]]) -// CHECK9-NEXT: [[TMP48:%.*]] = icmp ne i32 [[TMP47]], 0 -// CHECK9-NEXT: br i1 [[TMP48]], label [[OMP_OFFLOAD_FAILED5:%.*]], label [[OMP_OFFLOAD_CONT6:%.*]] -// CHECK9: omp_offload.failed5: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81(ptr [[T_VAR]]) #[[ATTR4]] -// CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT6]] -// CHECK9: omp_offload.cont6: +// CHECK9-NEXT: [[TMP31:%.*]] = load i32, ptr [[T_VAR]], align 128 +// CHECK9-NEXT: store i32 [[TMP31]], ptr [[T_VAR_CASTED1]], align 4 +// CHECK9-NEXT: [[TMP32:%.*]] = load i64, ptr [[T_VAR_CASTED1]], align 8 +// CHECK9-NEXT: [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0 +// CHECK9-NEXT: store i64 [[TMP32]], ptr [[TMP33]], align 8 +// CHECK9-NEXT: [[TMP34:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0 +// CHECK9-NEXT: store i64 [[TMP32]], ptr [[TMP34]], align 8 +// CHECK9-NEXT: [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS4]], i64 0, i64 0 +// CHECK9-NEXT: store ptr null, ptr [[TMP35]], align 8 +// CHECK9-NEXT: [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0 +// CHECK9-NEXT: [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0 +// CHECK9-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0 +// CHECK9-NEXT: store i32 2, ptr [[TMP38]], align 4 +// CHECK9-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1 +// CHECK9-NEXT: store i32 1, ptr [[TMP39]], align 4 +// CHECK9-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2 +// CHECK9-NEXT: store ptr [[TMP36]], ptr [[TMP40]], align 8 +// CHECK9-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 3 +// CHECK9-NEXT: store ptr [[TMP37]], ptr [[TMP41]], align 8 +// CHECK9-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 4 +// CHECK9-NEXT: store ptr @.offload_sizes.8, ptr [[TMP42]], align 8 +// CHECK9-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 5 +// CHECK9-NEXT: store ptr @.offload_maptypes.9, ptr [[TMP43]], align 8 +// CHECK9-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 6 +// 
CHECK9-NEXT: store ptr null, ptr [[TMP44]], align 8 +// CHECK9-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 7 +// CHECK9-NEXT: store ptr null, ptr [[TMP45]], align 8 +// CHECK9-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 8 +// CHECK9-NEXT: store i64 0, ptr [[TMP46]], align 8 +// CHECK9-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 9 +// CHECK9-NEXT: store i64 0, ptr [[TMP47]], align 8 +// CHECK9-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 10 +// CHECK9-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP48]], align 4 +// CHECK9-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 11 +// CHECK9-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP49]], align 4 +// CHECK9-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 12 +// CHECK9-NEXT: store i32 0, ptr [[TMP50]], align 4 +// CHECK9-NEXT: [[TMP51:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81.region_id, ptr [[KERNEL_ARGS5]]) +// CHECK9-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 +// CHECK9-NEXT: br i1 [[TMP52]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] +// CHECK9: omp_offload.failed6: +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81(i64 [[TMP32]]) #[[ATTR4]] +// CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT7]] +// CHECK9: omp_offload.cont7: // CHECK9-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0 -// CHECK9-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i64 2 +// CHECK9-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i64 2 // CHECK9-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK9: arraydestroy.body: -// CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP49]], [[OMP_OFFLOAD_CONT6]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] +// CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP53]], [[OMP_OFFLOAD_CONT7]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 // CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] -// CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE7:%.*]], label [[ARRAYDESTROY_BODY]] -// CHECK9: arraydestroy.done7: +// CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE8:%.*]], label [[ARRAYDESTROY_BODY]] +// CHECK9: arraydestroy.done8: // CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] -// CHECK9-NEXT: [[TMP50:%.*]] = load i32, ptr [[RETVAL]], align 4 -// CHECK9-NEXT: ret i32 [[TMP50]] +// CHECK9-NEXT: [[TMP54:%.*]] = load i32, ptr [[RETVAL]], align 4 +// CHECK9-NEXT: ret i32 [[TMP54]] // // // CHECK9-LABEL: define 
{{[^@]+}}@_ZN1SIfEC2Ev @@ -877,89 +877,85 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l75 -// CHECK9-SAME: (ptr nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (i64 [[T_VAR:%.*]], ptr nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR3]] { // CHECK9-NEXT: entry: -// CHECK9-NEXT: [[T_VAR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[T_VAR_ADDR:%.*]] = alloca i64, align 8 // CHECK9-NEXT: [[VEC_ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[S_ARR_ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[VAR_ADDR:%.*]] = alloca ptr, align 8 -// CHECK9-NEXT: [[T_VAR1:%.*]] = alloca i32, align 128 -// CHECK9-NEXT: store ptr [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 +// CHECK9-NEXT: [[T_VAR_CASTED:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 // CHECK9-NEXT: store ptr [[VEC]], ptr [[VEC_ADDR]], align 8 // CHECK9-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK9-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP0]], align 128 -// CHECK9-NEXT: store i32 [[TMP4]], ptr [[T_VAR1]], align 128 -// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 4, ptr @.omp_outlined..4, ptr [[TMP1]], ptr [[T_VAR1]], ptr [[TMP2]], ptr [[TMP3]]) +// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 +// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 +// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK9-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 +// CHECK9-NEXT: [[TMP4:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 +// CHECK9-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB1]], i32 4, ptr @.omp_outlined..4, ptr [[TMP0]], i64 [[TMP4]], ptr [[TMP1]], ptr [[TMP2]]) // CHECK9-NEXT: ret void // // // CHECK9-LABEL: define {{[^@]+}}@.omp_outlined..4 -// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 [[T_VAR:%.*]], ptr nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR3]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[VEC_ADDR:%.*]] = alloca ptr, align 8 -// CHECK9-NEXT: [[T_VAR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[T_VAR_ADDR:%.*]] = alloca i64, align 8 // CHECK9-NEXT: [[S_ARR_ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[VAR_ADDR:%.*]] = alloca ptr, align 8 -// CHECK9-NEXT: [[T_VAR1:%.*]] = alloca i32, align 128 -// CHECK9-NEXT: [[VEC2:%.*]] = alloca [2 x i32], align 128 -// CHECK9-NEXT: [[S_ARR3:%.*]] = alloca [2 x %struct.S.0], align 128 +// CHECK9-NEXT: [[VEC1:%.*]] = alloca [2 x i32], align 128 +// CHECK9-NEXT: [[S_ARR2:%.*]] = alloca [2 x %struct.S.0], align 128 // CHECK9-NEXT: [[AGG_TMP:%.*]] = alloca [[STRUCT_ST:%.*]], align 4 -// CHECK9-NEXT: [[VAR5:%.*]] = alloca [[STRUCT_S_0:%.*]], align 128 -// CHECK9-NEXT: [[AGG_TMP6:%.*]] = alloca [[STRUCT_ST]], align 4 +// CHECK9-NEXT: [[VAR4:%.*]] = alloca [[STRUCT_S_0:%.*]], align 128 +// CHECK9-NEXT: [[AGG_TMP5:%.*]] = alloca [[STRUCT_ST]], align 4 // CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 // CHECK9-NEXT: store ptr [[VEC]], ptr [[VEC_ADDR]], align 8 -// CHECK9-NEXT: store ptr [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 +// CHECK9-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 // CHECK9-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK9-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 128 -// CHECK9-NEXT: store i32 [[TMP4]], ptr [[T_VAR1]], align 128 -// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 128 [[VEC2]], ptr align 128 [[TMP0]], i64 8, i1 false) -// CHECK9-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR3]], i32 0, i32 0 -// CHECK9-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i64 2 -// CHECK9-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP5]] -// CHECK9-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE4:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]] +// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 +// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 128 [[VEC1]], ptr align 128 [[TMP0]], i64 8, i1 false) +// 
CHECK9-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR2]], i32 0, i32 0 +// CHECK9-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i64 2 +// CHECK9-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP3]] +// CHECK9-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE3:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]] // CHECK9: omp.arraycpy.body: -// CHECK9-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP2]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] +// CHECK9-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP1]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] // CHECK9-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] // CHECK9-NEXT: call void @_ZN2StC1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP]]) // CHECK9-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr nonnull align 4 dereferenceable(4) [[OMP_ARRAYCPY_DESTELEMENTPAST]], ptr nonnull align 4 dereferenceable(4) [[OMP_ARRAYCPY_SRCELEMENTPAST]], ptr [[AGG_TMP]]) // CHECK9-NEXT: call void @_ZN2StD1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP]]) #[[ATTR4]] // CHECK9-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr [[STRUCT_S_0]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1 // CHECK9-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr [[STRUCT_S_0]], ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1 -// CHECK9-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]] -// CHECK9-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]] -// CHECK9: omp.arraycpy.done4: -// CHECK9-NEXT: call void @_ZN2StC1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) -// CHECK9-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr nonnull align 4 dereferenceable(4) [[VAR5]], ptr nonnull align 4 dereferenceable(4) [[TMP3]], ptr [[AGG_TMP6]]) -// CHECK9-NEXT: call void @_ZN2StD1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) #[[ATTR4]] -// CHECK9-NEXT: [[TMP6:%.*]] = load i32, ptr [[T_VAR1]], align 128 -// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC2]], i64 0, i64 0 -// CHECK9-NEXT: store i32 [[TMP6]], ptr [[ARRAYIDX]], align 128 -// CHECK9-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR3]], i64 0, i64 0 -// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 128 [[ARRAYIDX7]], ptr align 128 [[VAR5]], i64 4, i1 false) -// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR4]] -// CHECK9-NEXT: [[ARRAY_BEGIN8:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR3]], i32 0, i32 0 -// CHECK9-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN8]], i64 2 +// CHECK9-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]] +// CHECK9-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE3]], label [[OMP_ARRAYCPY_BODY]] +// CHECK9: omp.arraycpy.done3: +// CHECK9-NEXT: call void @_ZN2StC1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP5]]) +// CHECK9-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr nonnull align 4 dereferenceable(4) [[VAR4]], ptr nonnull align 4 dereferenceable(4) [[TMP2]], ptr [[AGG_TMP5]]) +// CHECK9-NEXT: call void @_ZN2StD1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP5]]) #[[ATTR4]] +// CHECK9-NEXT: [[TMP4:%.*]] = load i32, ptr [[T_VAR_ADDR]], 
align 4 +// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC1]], i64 0, i64 0 +// CHECK9-NEXT: store i32 [[TMP4]], ptr [[ARRAYIDX]], align 128 +// CHECK9-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR2]], i64 0, i64 0 +// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 128 [[ARRAYIDX6]], ptr align 128 [[VAR4]], i64 4, i1 false) +// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[VAR4]]) #[[ATTR4]] +// CHECK9-NEXT: [[ARRAY_BEGIN7:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR2]], i32 0, i32 0 +// CHECK9-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN7]], i64 2 // CHECK9-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK9: arraydestroy.body: -// CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[OMP_ARRAYCPY_DONE4]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] +// CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[OMP_ARRAYCPY_DONE3]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 // CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] -// CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN8]] -// CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE9:%.*]], label [[ARRAYDESTROY_BODY]] -// CHECK9: arraydestroy.done9: +// CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN7]] +// CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE8:%.*]], label [[ARRAYDESTROY_BODY]] +// CHECK9: arraydestroy.done8: // CHECK9-NEXT: ret void // // @@ -989,31 +985,27 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81 -// CHECK9-SAME: (ptr nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (i64 [[T_VAR:%.*]]) #[[ATTR3]] { // CHECK9-NEXT: entry: -// CHECK9-NEXT: [[T_VAR_ADDR:%.*]] = alloca ptr, align 8 -// CHECK9-NEXT: [[T_VAR1:%.*]] = alloca i32, align 128 -// CHECK9-NEXT: store ptr [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 128 -// CHECK9-NEXT: store i32 [[TMP1]], ptr [[T_VAR1]], align 128 -// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @.omp_outlined..7, ptr [[T_VAR1]]) +// CHECK9-NEXT: [[T_VAR_ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[T_VAR_CASTED:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP0]], ptr [[T_VAR_CASTED]], align 4 +// CHECK9-NEXT: [[TMP1:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 +// CHECK9-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @.omp_outlined..7, i64 [[TMP1]]) // CHECK9-NEXT: ret void // // // CHECK9-LABEL: define {{[^@]+}}@.omp_outlined..7 -// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[T_VAR:%.*]]) #[[ATTR3]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 -// CHECK9-NEXT: [[T_VAR_ADDR:%.*]] = alloca ptr, align 8 -// CHECK9-NEXT: [[T_VAR1:%.*]] = alloca i32, align 128 +// CHECK9-NEXT: [[T_VAR_ADDR:%.*]] = alloca i64, align 8 // CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK9-NEXT: store ptr [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 128 -// CHECK9-NEXT: store i32 [[TMP1]], ptr [[T_VAR1]], align 128 +// CHECK9-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 // CHECK9-NEXT: ret void // // @@ -1438,14 +1430,16 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 128 // CHECK11-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S.0], align 128 // CHECK11-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S_0]], align 128 +// CHECK11-NEXT: [[T_VAR_CASTED:%.*]] = alloca i32, align 4 // CHECK11-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [4 x ptr], align 4 // CHECK11-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [4 x ptr], align 4 // CHECK11-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [4 x ptr], align 4 // CHECK11-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 -// CHECK11-NEXT: [[DOTOFFLOAD_BASEPTRS1:%.*]] = alloca [1 x ptr], align 4 -// CHECK11-NEXT: [[DOTOFFLOAD_PTRS2:%.*]] = alloca [1 x ptr], align 4 -// CHECK11-NEXT: [[DOTOFFLOAD_MAPPERS3:%.*]] = alloca [1 x ptr], align 4 -// CHECK11-NEXT: [[KERNEL_ARGS4:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK11-NEXT: [[T_VAR_CASTED1:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOFFLOAD_BASEPTRS2:%.*]] = alloca [1 x ptr], align 4 +// CHECK11-NEXT: [[DOTOFFLOAD_PTRS3:%.*]] = alloca [1 x ptr], align 4 +// CHECK11-NEXT: [[DOTOFFLOAD_MAPPERS4:%.*]] = alloca [1 x ptr], align 4 +// CHECK11-NEXT: [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 // CHECK11-NEXT: call void @_ZN1SIiEC1Ev(ptr nonnull align 4 dereferenceable(4) [[TEST]]) // CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 128 // CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 128 [[VEC]], ptr align 128 @__const._Z5tmainIiET_v.vec, i32 8, i1 false) @@ -1454,121 +1448,127 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1 // CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 2) // CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr nonnull align 4 dereferenceable(4) [[VAR]], i32 3) -// CHECK11-NEXT: [[TMP0:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 -// CHECK11-NEXT: store ptr [[T_VAR]], ptr [[TMP0]], align 4 -// CHECK11-NEXT: [[TMP1:%.*]] = getelementptr inbounds [4 x ptr], 
ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 -// CHECK11-NEXT: store ptr [[T_VAR]], ptr [[TMP1]], align 4 -// CHECK11-NEXT: [[TMP2:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 -// CHECK11-NEXT: store ptr null, ptr [[TMP2]], align 4 -// CHECK11-NEXT: [[TMP3:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 -// CHECK11-NEXT: store ptr [[VEC]], ptr [[TMP3]], align 4 -// CHECK11-NEXT: [[TMP4:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1 -// CHECK11-NEXT: store ptr [[VEC]], ptr [[TMP4]], align 4 -// CHECK11-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1 -// CHECK11-NEXT: store ptr null, ptr [[TMP5]], align 4 -// CHECK11-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 -// CHECK11-NEXT: store ptr [[S_ARR]], ptr [[TMP6]], align 4 -// CHECK11-NEXT: [[TMP7:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2 -// CHECK11-NEXT: store ptr [[S_ARR]], ptr [[TMP7]], align 4 -// CHECK11-NEXT: [[TMP8:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2 -// CHECK11-NEXT: store ptr null, ptr [[TMP8]], align 4 -// CHECK11-NEXT: [[TMP9:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 3 -// CHECK11-NEXT: store ptr [[VAR]], ptr [[TMP9]], align 4 -// CHECK11-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 3 -// CHECK11-NEXT: store ptr [[VAR]], ptr [[TMP10]], align 4 -// CHECK11-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 3 -// CHECK11-NEXT: store ptr null, ptr [[TMP11]], align 4 -// CHECK11-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 -// CHECK11-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 -// CHECK11-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 -// CHECK11-NEXT: store i32 2, ptr [[TMP14]], align 4 -// CHECK11-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 -// CHECK11-NEXT: store i32 4, ptr [[TMP15]], align 4 -// CHECK11-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 -// CHECK11-NEXT: store ptr [[TMP12]], ptr [[TMP16]], align 4 -// CHECK11-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 -// CHECK11-NEXT: store ptr [[TMP13]], ptr [[TMP17]], align 4 -// CHECK11-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 -// CHECK11-NEXT: store ptr @.offload_sizes.5, ptr [[TMP18]], align 4 -// CHECK11-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 -// CHECK11-NEXT: store ptr @.offload_maptypes.6, ptr [[TMP19]], align 4 -// CHECK11-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 -// CHECK11-NEXT: store ptr null, ptr [[TMP20]], align 4 -// CHECK11-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 -// CHECK11-NEXT: store ptr null, ptr [[TMP21]], align 4 -// CHECK11-NEXT: [[TMP22:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 -// CHECK11-NEXT: store i64 0, ptr [[TMP22]], align 8 -// CHECK11-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 -// CHECK11-NEXT: store i64 0, ptr [[TMP23]], align 8 -// CHECK11-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 -// CHECK11-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP24]], align 4 -// CHECK11-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 -// CHECK11-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP25]], align 4 -// CHECK11-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 -// CHECK11-NEXT: store i32 0, ptr [[TMP26]], align 4 -// CHECK11-NEXT: [[TMP27:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l75.region_id, ptr [[KERNEL_ARGS]]) -// CHECK11-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0 -// CHECK11-NEXT: br i1 [[TMP28]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK11-NEXT: [[TMP0:%.*]] = load i32, ptr [[T_VAR]], align 128 +// CHECK11-NEXT: store i32 [[TMP0]], ptr [[T_VAR_CASTED]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK11-NEXT: store i32 [[TMP1]], ptr [[TMP2]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK11-NEXT: store i32 [[TMP1]], ptr [[TMP3]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CHECK11-NEXT: store ptr null, ptr [[TMP4]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK11-NEXT: store ptr [[VEC]], ptr [[TMP5]], align 4 +// CHECK11-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK11-NEXT: store ptr [[VEC]], ptr [[TMP6]], align 4 +// CHECK11-NEXT: [[TMP7:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1 +// CHECK11-NEXT: store ptr null, ptr [[TMP7]], align 4 +// CHECK11-NEXT: [[TMP8:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK11-NEXT: store ptr [[S_ARR]], ptr [[TMP8]], align 4 +// CHECK11-NEXT: [[TMP9:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK11-NEXT: store ptr [[S_ARR]], ptr [[TMP9]], align 4 +// CHECK11-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2 +// CHECK11-NEXT: store ptr null, ptr [[TMP10]], align 4 +// CHECK11-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 3 +// CHECK11-NEXT: store ptr [[VAR]], ptr [[TMP11]], align 4 +// CHECK11-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 3 +// CHECK11-NEXT: store ptr [[VAR]], ptr [[TMP12]], align 4 +// CHECK11-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 3 +// CHECK11-NEXT: store ptr null, ptr [[TMP13]], align 4 +// CHECK11-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 
0, i32 0 +// CHECK11-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK11-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK11-NEXT: store i32 2, ptr [[TMP16]], align 4 +// CHECK11-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK11-NEXT: store i32 4, ptr [[TMP17]], align 4 +// CHECK11-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK11-NEXT: store ptr [[TMP14]], ptr [[TMP18]], align 4 +// CHECK11-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK11-NEXT: store ptr [[TMP15]], ptr [[TMP19]], align 4 +// CHECK11-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK11-NEXT: store ptr @.offload_sizes.5, ptr [[TMP20]], align 4 +// CHECK11-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK11-NEXT: store ptr @.offload_maptypes.6, ptr [[TMP21]], align 4 +// CHECK11-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK11-NEXT: store ptr null, ptr [[TMP22]], align 4 +// CHECK11-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK11-NEXT: store ptr null, ptr [[TMP23]], align 4 +// CHECK11-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK11-NEXT: store i64 0, ptr [[TMP24]], align 8 +// CHECK11-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK11-NEXT: store i64 0, ptr [[TMP25]], align 8 +// CHECK11-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK11-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP26]], align 4 +// CHECK11-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK11-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP27]], align 4 +// CHECK11-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK11-NEXT: store i32 0, ptr [[TMP28]], align 4 +// CHECK11-NEXT: [[TMP29:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l75.region_id, ptr [[KERNEL_ARGS]]) +// CHECK11-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 +// CHECK11-NEXT: br i1 [[TMP30]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l75(ptr [[T_VAR]], ptr [[VEC]], ptr [[S_ARR]], ptr [[VAR]]) #[[ATTR4]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l75(i32 [[TMP1]], ptr [[VEC]], ptr [[S_ARR]], ptr [[VAR]]) #[[ATTR4]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: -// CHECK11-NEXT: [[TMP29:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 -// CHECK11-NEXT: store ptr [[T_VAR]], ptr [[TMP29]], align 4 -// CHECK11-NEXT: [[TMP30:%.*]] = getelementptr 
inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0 -// CHECK11-NEXT: store ptr [[T_VAR]], ptr [[TMP30]], align 4 -// CHECK11-NEXT: [[TMP31:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS3]], i32 0, i32 0 -// CHECK11-NEXT: store ptr null, ptr [[TMP31]], align 4 -// CHECK11-NEXT: [[TMP32:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 -// CHECK11-NEXT: [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0 -// CHECK11-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0 -// CHECK11-NEXT: store i32 2, ptr [[TMP34]], align 4 -// CHECK11-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1 -// CHECK11-NEXT: store i32 1, ptr [[TMP35]], align 4 -// CHECK11-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2 -// CHECK11-NEXT: store ptr [[TMP32]], ptr [[TMP36]], align 4 -// CHECK11-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 3 -// CHECK11-NEXT: store ptr [[TMP33]], ptr [[TMP37]], align 4 -// CHECK11-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 4 -// CHECK11-NEXT: store ptr @.offload_sizes.8, ptr [[TMP38]], align 4 -// CHECK11-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 5 -// CHECK11-NEXT: store ptr @.offload_maptypes.9, ptr [[TMP39]], align 4 -// CHECK11-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 6 -// CHECK11-NEXT: store ptr null, ptr [[TMP40]], align 4 -// CHECK11-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 7 -// CHECK11-NEXT: store ptr null, ptr [[TMP41]], align 4 -// CHECK11-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 8 -// CHECK11-NEXT: store i64 0, ptr [[TMP42]], align 8 -// CHECK11-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 9 -// CHECK11-NEXT: store i64 0, ptr [[TMP43]], align 8 -// CHECK11-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 10 -// CHECK11-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP44]], align 4 -// CHECK11-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 11 -// CHECK11-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP45]], align 4 -// CHECK11-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 12 -// CHECK11-NEXT: store i32 0, ptr [[TMP46]], align 4 -// CHECK11-NEXT: [[TMP47:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81.region_id, ptr [[KERNEL_ARGS4]]) -// CHECK11-NEXT: [[TMP48:%.*]] = icmp ne i32 [[TMP47]], 0 -// CHECK11-NEXT: br i1 [[TMP48]], label [[OMP_OFFLOAD_FAILED5:%.*]], label [[OMP_OFFLOAD_CONT6:%.*]] -// CHECK11: omp_offload.failed5: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81(ptr [[T_VAR]]) #[[ATTR4]] -// CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT6]] -// CHECK11: omp_offload.cont6: +// 
CHECK11-NEXT: [[TMP31:%.*]] = load i32, ptr [[T_VAR]], align 128 +// CHECK11-NEXT: store i32 [[TMP31]], ptr [[T_VAR_CASTED1]], align 4 +// CHECK11-NEXT: [[TMP32:%.*]] = load i32, ptr [[T_VAR_CASTED1]], align 4 +// CHECK11-NEXT: [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0 +// CHECK11-NEXT: store i32 [[TMP32]], ptr [[TMP33]], align 4 +// CHECK11-NEXT: [[TMP34:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0 +// CHECK11-NEXT: store i32 [[TMP32]], ptr [[TMP34]], align 4 +// CHECK11-NEXT: [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS4]], i32 0, i32 0 +// CHECK11-NEXT: store ptr null, ptr [[TMP35]], align 4 +// CHECK11-NEXT: [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0 +// CHECK11-NEXT: [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0 +// CHECK11-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0 +// CHECK11-NEXT: store i32 2, ptr [[TMP38]], align 4 +// CHECK11-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1 +// CHECK11-NEXT: store i32 1, ptr [[TMP39]], align 4 +// CHECK11-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2 +// CHECK11-NEXT: store ptr [[TMP36]], ptr [[TMP40]], align 4 +// CHECK11-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 3 +// CHECK11-NEXT: store ptr [[TMP37]], ptr [[TMP41]], align 4 +// CHECK11-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 4 +// CHECK11-NEXT: store ptr @.offload_sizes.8, ptr [[TMP42]], align 4 +// CHECK11-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 5 +// CHECK11-NEXT: store ptr @.offload_maptypes.9, ptr [[TMP43]], align 4 +// CHECK11-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 6 +// CHECK11-NEXT: store ptr null, ptr [[TMP44]], align 4 +// CHECK11-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 7 +// CHECK11-NEXT: store ptr null, ptr [[TMP45]], align 4 +// CHECK11-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 8 +// CHECK11-NEXT: store i64 0, ptr [[TMP46]], align 8 +// CHECK11-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 9 +// CHECK11-NEXT: store i64 0, ptr [[TMP47]], align 8 +// CHECK11-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 10 +// CHECK11-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP48]], align 4 +// CHECK11-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 11 +// CHECK11-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP49]], align 4 +// CHECK11-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 12 +// CHECK11-NEXT: store i32 0, ptr [[TMP50]], align 4 +// CHECK11-NEXT: [[TMP51:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr 
@.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81.region_id, ptr [[KERNEL_ARGS5]]) +// CHECK11-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 +// CHECK11-NEXT: br i1 [[TMP52]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] +// CHECK11: omp_offload.failed6: +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81(i32 [[TMP32]]) #[[ATTR4]] +// CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT7]] +// CHECK11: omp_offload.cont7: // CHECK11-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0 -// CHECK11-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i32 2 +// CHECK11-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i32 2 // CHECK11-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK11: arraydestroy.body: -// CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP49]], [[OMP_OFFLOAD_CONT6]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] +// CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP53]], [[OMP_OFFLOAD_CONT7]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 // CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] -// CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE7:%.*]], label [[ARRAYDESTROY_BODY]] -// CHECK11: arraydestroy.done7: +// CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE8:%.*]], label [[ARRAYDESTROY_BODY]] +// CHECK11: arraydestroy.done8: // CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] -// CHECK11-NEXT: [[TMP50:%.*]] = load i32, ptr [[RETVAL]], align 4 -// CHECK11-NEXT: ret i32 [[TMP50]] +// CHECK11-NEXT: [[TMP54:%.*]] = load i32, ptr [[RETVAL]], align 4 +// CHECK11-NEXT: ret i32 [[TMP54]] // // // CHECK11-LABEL: define {{[^@]+}}@_ZN1SIfEC2Ev @@ -1678,89 +1678,85 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l75 -// CHECK11-SAME: (ptr nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (i32 [[T_VAR:%.*]], ptr nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR3]] { // CHECK11-NEXT: entry: -// CHECK11-NEXT: [[T_VAR_ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[T_VAR_ADDR:%.*]] = alloca i32, align 4 // CHECK11-NEXT: [[VEC_ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[S_ARR_ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[VAR_ADDR:%.*]] = alloca ptr, align 4 -// CHECK11-NEXT: [[T_VAR1:%.*]] = alloca i32, align 128 -// CHECK11-NEXT: store ptr [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 +// CHECK11-NEXT: [[T_VAR_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 // CHECK11-NEXT: 
store ptr [[VEC]], ptr [[VEC_ADDR]], align 4 // CHECK11-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK11-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 4 -// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK11-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 -// CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP0]], align 128 -// CHECK11-NEXT: store i32 [[TMP4]], ptr [[T_VAR1]], align 128 -// CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 4, ptr @.omp_outlined..4, ptr [[TMP1]], ptr [[T_VAR1]], ptr [[TMP2]], ptr [[TMP3]]) +// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 +// CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 4, ptr @.omp_outlined..4, ptr [[TMP0]], i32 [[TMP4]], ptr [[TMP1]], ptr [[TMP2]]) // CHECK11-NEXT: ret void // // // CHECK11-LABEL: define {{[^@]+}}@.omp_outlined..4 -// CHECK11-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(8) [[VEC:%.*]], i32 [[T_VAR:%.*]], ptr nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR3]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[VEC_ADDR:%.*]] = alloca ptr, align 4 -// CHECK11-NEXT: [[T_VAR_ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[T_VAR_ADDR:%.*]] = alloca i32, align 4 // CHECK11-NEXT: [[S_ARR_ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[VAR_ADDR:%.*]] = alloca ptr, align 4 -// CHECK11-NEXT: [[T_VAR1:%.*]] = alloca i32, align 128 -// CHECK11-NEXT: [[VEC2:%.*]] = alloca [2 x i32], align 128 -// CHECK11-NEXT: [[S_ARR3:%.*]] = alloca [2 x %struct.S.0], align 128 +// CHECK11-NEXT: [[VEC1:%.*]] = alloca [2 x i32], align 128 +// CHECK11-NEXT: [[S_ARR2:%.*]] = alloca [2 x %struct.S.0], align 128 // CHECK11-NEXT: [[AGG_TMP:%.*]] = alloca [[STRUCT_ST:%.*]], align 4 -// CHECK11-NEXT: [[VAR5:%.*]] = alloca [[STRUCT_S_0:%.*]], align 128 -// CHECK11-NEXT: [[AGG_TMP6:%.*]] = alloca [[STRUCT_ST]], align 4 +// CHECK11-NEXT: [[VAR4:%.*]] = alloca [[STRUCT_S_0:%.*]], align 128 +// CHECK11-NEXT: [[AGG_TMP5:%.*]] = alloca [[STRUCT_ST]], align 4 // CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 // CHECK11-NEXT: store ptr [[VEC]], ptr [[VEC_ADDR]], align 4 -// CHECK11-NEXT: store ptr [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 // CHECK11-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], 
align 4 // CHECK11-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 4 -// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK11-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 -// CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 128 -// CHECK11-NEXT: store i32 [[TMP4]], ptr [[T_VAR1]], align 128 -// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 128 [[VEC2]], ptr align 128 [[TMP0]], i32 8, i1 false) -// CHECK11-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR3]], i32 0, i32 0 -// CHECK11-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i32 2 -// CHECK11-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP5]] -// CHECK11-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE4:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]] +// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 128 [[VEC1]], ptr align 128 [[TMP0]], i32 8, i1 false) +// CHECK11-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR2]], i32 0, i32 0 +// CHECK11-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i32 2 +// CHECK11-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP3]] +// CHECK11-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE3:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]] // CHECK11: omp.arraycpy.body: -// CHECK11-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP2]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] +// CHECK11-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP1]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] // CHECK11-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] // CHECK11-NEXT: call void @_ZN2StC1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP]]) // CHECK11-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr nonnull align 4 dereferenceable(4) [[OMP_ARRAYCPY_DESTELEMENTPAST]], ptr nonnull align 4 dereferenceable(4) [[OMP_ARRAYCPY_SRCELEMENTPAST]], ptr [[AGG_TMP]]) // CHECK11-NEXT: call void @_ZN2StD1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP]]) #[[ATTR4]] // CHECK11-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr [[STRUCT_S_0]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1 // CHECK11-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr [[STRUCT_S_0]], ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1 -// CHECK11-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]] -// CHECK11-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]] -// CHECK11: omp.arraycpy.done4: -// CHECK11-NEXT: call void @_ZN2StC1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) -// CHECK11-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr nonnull align 4 dereferenceable(4) [[VAR5]], ptr nonnull align 4 dereferenceable(4) [[TMP3]], ptr [[AGG_TMP6]]) -// CHECK11-NEXT: call void @_ZN2StD1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) #[[ATTR4]] -// CHECK11-NEXT: [[TMP6:%.*]] = load i32, ptr [[T_VAR1]], align 128 -// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x 
i32], ptr [[VEC2]], i32 0, i32 0 -// CHECK11-NEXT: store i32 [[TMP6]], ptr [[ARRAYIDX]], align 128 -// CHECK11-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR3]], i32 0, i32 0 -// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 128 [[ARRAYIDX7]], ptr align 128 [[VAR5]], i32 4, i1 false) -// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR4]] -// CHECK11-NEXT: [[ARRAY_BEGIN8:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR3]], i32 0, i32 0 -// CHECK11-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN8]], i32 2 +// CHECK11-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]] +// CHECK11-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE3]], label [[OMP_ARRAYCPY_BODY]] +// CHECK11: omp.arraycpy.done3: +// CHECK11-NEXT: call void @_ZN2StC1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP5]]) +// CHECK11-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr nonnull align 4 dereferenceable(4) [[VAR4]], ptr nonnull align 4 dereferenceable(4) [[TMP2]], ptr [[AGG_TMP5]]) +// CHECK11-NEXT: call void @_ZN2StD1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP5]]) #[[ATTR4]] +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 +// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC1]], i32 0, i32 0 +// CHECK11-NEXT: store i32 [[TMP4]], ptr [[ARRAYIDX]], align 128 +// CHECK11-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR2]], i32 0, i32 0 +// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 128 [[ARRAYIDX6]], ptr align 128 [[VAR4]], i32 4, i1 false) +// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[VAR4]]) #[[ATTR4]] +// CHECK11-NEXT: [[ARRAY_BEGIN7:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR2]], i32 0, i32 0 +// CHECK11-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN7]], i32 2 // CHECK11-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK11: arraydestroy.body: -// CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[OMP_ARRAYCPY_DONE4]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] +// CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[OMP_ARRAYCPY_DONE3]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 // CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] -// CHECK11-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN8]] -// CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE9:%.*]], label [[ARRAYDESTROY_BODY]] -// CHECK11: arraydestroy.done9: +// CHECK11-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN7]] +// CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE8:%.*]], label [[ARRAYDESTROY_BODY]] +// CHECK11: arraydestroy.done8: // CHECK11-NEXT: ret void // // @@ -1790,31 +1786,27 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81 -// CHECK11-SAME: (ptr nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (i32 [[T_VAR:%.*]]) #[[ATTR3]] { // CHECK11-NEXT: entry: -// CHECK11-NEXT: 
[[T_VAR_ADDR:%.*]] = alloca ptr, align 4 -// CHECK11-NEXT: [[T_VAR1:%.*]] = alloca i32, align 128 -// CHECK11-NEXT: store ptr [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 4 -// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 128 -// CHECK11-NEXT: store i32 [[TMP1]], ptr [[T_VAR1]], align 128 -// CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @.omp_outlined..7, ptr [[T_VAR1]]) +// CHECK11-NEXT: [[T_VAR_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[T_VAR_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP0]], ptr [[T_VAR_CASTED]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 +// CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @.omp_outlined..7, i32 [[TMP1]]) // CHECK11-NEXT: ret void // // // CHECK11-LABEL: define {{[^@]+}}@.omp_outlined..7 -// CHECK11-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i32 [[T_VAR:%.*]]) #[[ATTR3]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 -// CHECK11-NEXT: [[T_VAR_ADDR:%.*]] = alloca ptr, align 4 -// CHECK11-NEXT: [[T_VAR1:%.*]] = alloca i32, align 128 +// CHECK11-NEXT: [[T_VAR_ADDR:%.*]] = alloca i32, align 4 // CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 -// CHECK11-NEXT: store ptr [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 4 -// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 128 -// CHECK11-NEXT: store i32 [[TMP1]], ptr [[T_VAR1]], align 128 +// CHECK11-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 // CHECK11-NEXT: ret void // // From 299f3ac5233fb411bc0432a1ed24724ff532f2c5 Mon Sep 17 00:00:00 2001 From: Jun Zhang Date: Wed, 22 Mar 2023 23:41:53 +0800 Subject: [PATCH 004/208] Regenerate checks for bswap.ll, NFC Signed-off-by: Jun Zhang --- llvm/test/Transforms/InstCombine/bswap.ll | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/test/Transforms/InstCombine/bswap.ll b/llvm/test/Transforms/InstCombine/bswap.ll index 09dbff00f0055..8c5c761c73e29 100644 --- a/llvm/test/Transforms/InstCombine/bswap.ll +++ b/llvm/test/Transforms/InstCombine/bswap.ll @@ -541,8 +541,8 @@ define i8 @PR39793_bswap_u32_as_u16_trunc(i32 %0) { define i32 @partial_bswap(i32 %x) { ; CHECK-LABEL: @partial_bswap( -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.bswap.i32(i32 [[X:%.*]]) -; CHECK-NEXT: ret i32 [[TMP1]] +; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.bswap.i32(i32 [[X:%.*]]) +; CHECK-NEXT: ret i32 [[R]] ; %x3 = shl i32 %x, 24 %a2 = shl i32 %x, 8 @@ -557,8 +557,8 @@ declare i32 @llvm.bswap.i32(i32) define <2 x i32> @partial_bswap_vector(<2 x i32> %x) { ; CHECK-LABEL: @partial_bswap_vector( -; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[X:%.*]]) -; CHECK-NEXT: ret <2 x i32> [[TMP1]] +; CHECK-NEXT: [[R:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[X:%.*]]) +; CHECK-NEXT: 
ret <2 x i32> [[R]] ; %x3 = shl <2 x i32> %x, %a2 = shl <2 x i32> %x, From ada03565261ab6ef1c5bca217767fe7f69d19a99 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 22 Mar 2023 15:56:23 +0000 Subject: [PATCH 005/208] [X86] Extend all_of(icmp_eq()) / any_of(icmp_ne()) -> scalar integer fold to AVX512 targets Extends 1bb95a3a99cb44f2b8b801e5137d3ac529253f3b to combine on AVX512 targets where the vXi1 type is legal Continues work on addressing Issue #53419 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 32 +++++---- llvm/test/CodeGen/X86/pr53419.ll | 90 +++++-------------------- 2 files changed, 36 insertions(+), 86 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a491ba84bf705..6cf359d6d217a 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -44646,6 +44646,23 @@ static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, // Special case for (pre-legalization) vXi1 reductions. if (NumElts > 64 || !isPowerOf2_32(NumElts)) return SDValue(); + if (Match.getOpcode() == ISD::SETCC) { + ISD::CondCode CC = cast(Match.getOperand(2))->get(); + if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) || + (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) { + // If representable as a scalar integer: + // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y. + // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y. + EVT VecVT = Match.getOperand(0).getValueType(); + EVT IntVT = EVT::getIntegerVT(Ctx, VecVT.getSizeInBits()); + if (TLI.isTypeLegal(IntVT)) { + SDValue LHS = DAG.getFreeze(Match.getOperand(0)); + SDValue RHS = DAG.getFreeze(Match.getOperand(1)); + return DAG.getSetCC(DL, ExtractVT, DAG.getBitcast(IntVT, LHS), + DAG.getBitcast(IntVT, RHS), CC); + } + } + } if (TLI.isTypeLegal(MatchVT)) { // If this is a legal AVX512 predicate type then we can just bitcast. EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts); @@ -44657,20 +44674,7 @@ static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, ISD::CondCode CC = cast(Match.getOperand(2))->get(); if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) || (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) { - EVT VecVT = Match.getOperand(0).getValueType(); - - // If representable as a scalar integer: - // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y. - // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y. - EVT IntVT = EVT::getIntegerVT(Ctx, VecVT.getSizeInBits()); - if (TLI.isTypeLegal(IntVT)) { - SDValue LHS = DAG.getFreeze(Match.getOperand(0)); - SDValue RHS = DAG.getFreeze(Match.getOperand(1)); - return DAG.getSetCC(DL, ExtractVT, DAG.getBitcast(IntVT, LHS), - DAG.getBitcast(IntVT, RHS), CC); - } - - EVT VecSVT = VecVT.getScalarType(); + EVT VecSVT = Match.getOperand(0).getValueType().getScalarType(); if (VecSVT != MVT::i8 && (VecSVT.getSizeInBits() % 8) == 0) { NumElts *= VecSVT.getSizeInBits() / 8; EVT CmpVT = EVT::getVectorVT(Ctx, MVT::i8, NumElts); diff --git a/llvm/test/CodeGen/X86/pr53419.ll b/llvm/test/CodeGen/X86/pr53419.ll index d92a7ceecec48..9455810fa2d78 100644 --- a/llvm/test/CodeGen/X86/pr53419.ll +++ b/llvm/test/CodeGen/X86/pr53419.ll @@ -13,32 +13,12 @@ declare i1 @llvm.vector.reduce.and.v8i1(<8 x i1>) ; FIXME: All four versions are semantically equivalent and should produce same asm as scalar version. 
define i1 @intrinsic_v2i8(ptr align 1 %arg, ptr align 1 %arg1) { -; SSE-LABEL: intrinsic_v2i8: -; SSE: # %bb.0: # %bb -; SSE-NEXT: movzwl (%rdi), %eax -; SSE-NEXT: cmpw %ax, (%rsi) -; SSE-NEXT: sete %al -; SSE-NEXT: retq -; -; AVX-LABEL: intrinsic_v2i8: -; AVX: # %bb.0: # %bb -; AVX-NEXT: movzwl (%rdi), %eax -; AVX-NEXT: cmpw %ax, (%rsi) -; AVX-NEXT: sete %al -; AVX-NEXT: retq -; -; AVX512-LABEL: intrinsic_v2i8: -; AVX512: # %bb.0: # %bb -; AVX512-NEXT: movzwl (%rsi), %eax -; AVX512-NEXT: vmovd %eax, %xmm0 -; AVX512-NEXT: movzwl (%rdi), %eax -; AVX512-NEXT: vmovd %eax, %xmm1 -; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 -; AVX512-NEXT: knotw %k0, %k0 -; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: testb $3, %al -; AVX512-NEXT: sete %al -; AVX512-NEXT: retq +; X64-LABEL: intrinsic_v2i8: +; X64: # %bb.0: # %bb +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: cmpw %ax, (%rsi) +; X64-NEXT: sete %al +; X64-NEXT: retq ; ; X86-LABEL: intrinsic_v2i8: ; X86: # %bb.0: # %bb @@ -57,30 +37,12 @@ bb: } define i1 @intrinsic_v4i8(ptr align 1 %arg, ptr align 1 %arg1) { -; SSE-LABEL: intrinsic_v4i8: -; SSE: # %bb.0: # %bb -; SSE-NEXT: movl (%rdi), %eax -; SSE-NEXT: cmpl %eax, (%rsi) -; SSE-NEXT: sete %al -; SSE-NEXT: retq -; -; AVX-LABEL: intrinsic_v4i8: -; AVX: # %bb.0: # %bb -; AVX-NEXT: movl (%rdi), %eax -; AVX-NEXT: cmpl %eax, (%rsi) -; AVX-NEXT: sete %al -; AVX-NEXT: retq -; -; AVX512-LABEL: intrinsic_v4i8: -; AVX512: # %bb.0: # %bb -; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 -; AVX512-NEXT: knotw %k0, %k0 -; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: testb $15, %al -; AVX512-NEXT: sete %al -; AVX512-NEXT: retq +; X64-LABEL: intrinsic_v4i8: +; X64: # %bb.0: # %bb +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: cmpl %eax, (%rsi) +; X64-NEXT: sete %al +; X64-NEXT: retq ; ; X86-LABEL: intrinsic_v4i8: ; X86: # %bb.0: # %bb @@ -99,28 +61,12 @@ bb: } define i1 @intrinsic_v8i8(ptr align 1 %arg, ptr align 1 %arg1) { -; SSE-LABEL: intrinsic_v8i8: -; SSE: # %bb.0: # %bb -; SSE-NEXT: movq (%rdi), %rax -; SSE-NEXT: cmpq %rax, (%rsi) -; SSE-NEXT: sete %al -; SSE-NEXT: retq -; -; AVX-LABEL: intrinsic_v8i8: -; AVX: # %bb.0: # %bb -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: cmpq %rax, (%rsi) -; AVX-NEXT: sete %al -; AVX-NEXT: retq -; -; AVX512-LABEL: intrinsic_v8i8: -; AVX512: # %bb.0: # %bb -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 -; AVX512-NEXT: kortestb %k0, %k0 -; AVX512-NEXT: setb %al -; AVX512-NEXT: retq +; X64-LABEL: intrinsic_v8i8: +; X64: # %bb.0: # %bb +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq %rax, (%rsi) +; X64-NEXT: sete %al +; X64-NEXT: retq ; ; X86-LABEL: intrinsic_v8i8: ; X86: # %bb.0: # %bb From 71dc3de533b9247223c083a3b058859c9759099c Mon Sep 17 00:00:00 2001 From: Caleb Zulawski Date: Wed, 22 Mar 2023 16:00:19 +0000 Subject: [PATCH 006/208] [ARM] Improve min/max vector reductions on Arm This patch adds some more efficient lowering for vecreduce.min/max under NEON, using sequences of pairwise vpmin/vpmax to reduce to a single value. This nearly resolves issues such as #50466, #40981, #38190. 
Differential Revision: https://reviews.llvm.org/D146404 --- llvm/lib/Target/ARM/ARMISelLowering.cpp | 87 +++++++++ llvm/test/CodeGen/ARM/vecreduce-minmax.ll | 219 ++++++++++++++++++++++ 2 files changed, 306 insertions(+) create mode 100644 llvm/test/CodeGen/ARM/vecreduce-minmax.ll diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 126bbc61a7d30..9c5f0df4d9468 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1007,6 +1007,14 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal); } } + + for (auto VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16, + MVT::v4i32}) { + setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); + setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); + setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); + setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); + } } if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) { @@ -10271,6 +10279,80 @@ static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG, return LowerVecReduce(Op, DAG, ST); } +static SDValue LowerVecReduceMinMax(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { + if (!ST->hasNEON()) + return SDValue(); + + SDLoc dl(Op); + SDValue Op0 = Op->getOperand(0); + EVT VT = Op0.getValueType(); + EVT EltVT = VT.getVectorElementType(); + + unsigned PairwiseIntrinsic = 0; + switch (Op->getOpcode()) { + default: + llvm_unreachable("Expected VECREDUCE opcode"); + case ISD::VECREDUCE_UMIN: + PairwiseIntrinsic = Intrinsic::arm_neon_vpminu; + break; + case ISD::VECREDUCE_UMAX: + PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxu; + break; + case ISD::VECREDUCE_SMIN: + PairwiseIntrinsic = Intrinsic::arm_neon_vpmins; + break; + case ISD::VECREDUCE_SMAX: + PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxs; + break; + } + SDValue PairwiseOp = DAG.getConstant(PairwiseIntrinsic, dl, MVT::i32); + + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumActiveLanes = NumElts; + + assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 || + NumActiveLanes == 2) && + "Only expected a power 2 vector size"); + + // Split 128-bit vectors, since vpmin/max takes 2 64-bit vectors. + if (VT.is128BitVector()) { + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(Op0, dl); + VT = Lo.getValueType(); + Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Lo, Hi}); + NumActiveLanes /= 2; + } + + // Use pairwise reductions until one lane remains + while (NumActiveLanes > 1) { + Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Op0, Op0}); + NumActiveLanes /= 2; + } + + SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, + DAG.getConstant(0, dl, MVT::i32)); + + // Result type may be wider than element type. 
+ if (EltVT != Op.getValueType()) { + unsigned Extend = 0; + switch (Op->getOpcode()) { + default: + llvm_unreachable("Expected VECREDUCE opcode"); + case ISD::VECREDUCE_UMIN: + case ISD::VECREDUCE_UMAX: + Extend = ISD::ZERO_EXTEND; + break; + case ISD::VECREDUCE_SMIN: + case ISD::VECREDUCE_SMAX: + Extend = ISD::SIGN_EXTEND; + break; + } + Res = DAG.getNode(Extend, dl, Op.getValueType(), Res); + } + return Res; +} + static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { if (isStrongerThanMonotonic(cast(Op)->getSuccessOrdering())) // Acquire/Release load/store is not legal for targets without a dmb or @@ -10502,6 +10584,11 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::VECREDUCE_FMIN: case ISD::VECREDUCE_FMAX: return LowerVecReduceF(Op, DAG, Subtarget); + case ISD::VECREDUCE_UMIN: + case ISD::VECREDUCE_UMAX: + case ISD::VECREDUCE_SMIN: + case ISD::VECREDUCE_SMAX: + return LowerVecReduceMinMax(Op, DAG, Subtarget); case ISD::ATOMIC_LOAD: case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); diff --git a/llvm/test/CodeGen/ARM/vecreduce-minmax.ll b/llvm/test/CodeGen/ARM/vecreduce-minmax.ll new file mode 100644 index 0000000000000..c392e6ca6bfa6 --- /dev/null +++ b/llvm/test/CodeGen/ARM/vecreduce-minmax.ll @@ -0,0 +1,219 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=armv7-none-eabi -float-abi=hard -mattr=+neon -verify-machineinstrs | FileCheck %s + +define i8 @test_umin_v8i8(<8 x i8> %x) { +; CHECK-LABEL: test_umin_v8i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vpmin.u8 d16, d0, d0 +; CHECK-NEXT: vpmin.u8 d16, d16, d16 +; CHECK-NEXT: vpmin.u8 d16, d16, d16 +; CHECK-NEXT: vmov.u8 r0, d16[0] +; CHECK-NEXT: bx lr +entry: + %z = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %x) + ret i8 %z +} + +define i8 @test_smin_v8i8(<8 x i8> %x) { +; CHECK-LABEL: test_smin_v8i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vpmin.s8 d16, d0, d0 +; CHECK-NEXT: vpmin.s8 d16, d16, d16 +; CHECK-NEXT: vpmin.s8 d16, d16, d16 +; CHECK-NEXT: vmov.s8 r0, d16[0] +; CHECK-NEXT: bx lr +entry: + %z = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %x) + ret i8 %z +} + +define i8 @test_umax_v8i8(<8 x i8> %x) { +; CHECK-LABEL: test_umax_v8i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vpmax.u8 d16, d0, d0 +; CHECK-NEXT: vpmax.u8 d16, d16, d16 +; CHECK-NEXT: vpmax.u8 d16, d16, d16 +; CHECK-NEXT: vmov.u8 r0, d16[0] +; CHECK-NEXT: bx lr +entry: + %z = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %x) + ret i8 %z +} + +define i8 @test_smax_v8i8(<8 x i8> %x) { +; CHECK-LABEL: test_smax_v8i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vpmax.s8 d16, d0, d0 +; CHECK-NEXT: vpmax.s8 d16, d16, d16 +; CHECK-NEXT: vpmax.s8 d16, d16, d16 +; CHECK-NEXT: vmov.s8 r0, d16[0] +; CHECK-NEXT: bx lr +entry: + %z = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %x) + ret i8 %z +} + +define i16 @test_umin_v4i16(<4 x i16> %x) { +; CHECK-LABEL: test_umin_v4i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vpmin.u16 d16, d0, d0 +; CHECK-NEXT: vpmin.u16 d16, d16, d16 +; CHECK-NEXT: vmov.u16 r0, d16[0] +; CHECK-NEXT: bx lr +entry: + %z = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %x) + ret i16 %z +} + +define i16 @test_smin_v4i16(<4 x i16> %x) { +; CHECK-LABEL: test_smin_v4i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vpmin.s16 d16, d0, d0 +; CHECK-NEXT: vpmin.s16 d16, d16, d16 +; CHECK-NEXT: vmov.s16 r0, d16[0] +; CHECK-NEXT: bx lr +entry: + %z = call i16 
@llvm.vector.reduce.smin.v4i16(<4 x i16> %x) + ret i16 %z +} + +define i16 @test_umax_v4i16(<4 x i16> %x) { +; CHECK-LABEL: test_umax_v4i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vpmax.u16 d16, d0, d0 +; CHECK-NEXT: vpmax.u16 d16, d16, d16 +; CHECK-NEXT: vmov.u16 r0, d16[0] +; CHECK-NEXT: bx lr +entry: + %z = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %x) + ret i16 %z +} + +define i16 @test_smax_v4i16(<4 x i16> %x) { +; CHECK-LABEL: test_smax_v4i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vpmax.s16 d16, d0, d0 +; CHECK-NEXT: vpmax.s16 d16, d16, d16 +; CHECK-NEXT: vmov.s16 r0, d16[0] +; CHECK-NEXT: bx lr +entry: + %z = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %x) + ret i16 %z +} + +define i32 @test_umin_v2i32(<2 x i32> %x) { +; CHECK-LABEL: test_umin_v2i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vpmin.u32 d16, d0, d0 +; CHECK-NEXT: vmov.32 r0, d16[0] +; CHECK-NEXT: bx lr +entry: + %z = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %x) + ret i32 %z +} + +define i32 @test_smin_v2i32(<2 x i32> %x) { +; CHECK-LABEL: test_smin_v2i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vpmin.s32 d16, d0, d0 +; CHECK-NEXT: vmov.32 r0, d16[0] +; CHECK-NEXT: bx lr +entry: + %z = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %x) + ret i32 %z +} + +define i32 @test_umax_v2i32(<2 x i32> %x) { +; CHECK-LABEL: test_umax_v2i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vpmax.u32 d16, d0, d0 +; CHECK-NEXT: vmov.32 r0, d16[0] +; CHECK-NEXT: bx lr +entry: + %z = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %x) + ret i32 %z +} + +define i32 @test_smax_v2i32(<2 x i32> %x) { +; CHECK-LABEL: test_smax_v2i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vpmax.s32 d16, d0, d0 +; CHECK-NEXT: vmov.32 r0, d16[0] +; CHECK-NEXT: bx lr +entry: + %z = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %x) + ret i32 %z +} + +define i8 @test_umin_v16i8(<16 x i8> %x) { +; CHECK-LABEL: test_umin_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vpmin.u8 d16, d0, d1 +; CHECK-NEXT: vpmin.u8 d16, d16, d16 +; CHECK-NEXT: vpmin.u8 d16, d16, d16 +; CHECK-NEXT: vpmin.u8 d16, d16, d16 +; CHECK-NEXT: vmov.u8 r0, d16[0] +; CHECK-NEXT: bx lr +entry: + %z = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %x) + ret i8 %z +} + +define i16 @test_smin_v8i16(<8 x i16> %x) { +; CHECK-LABEL: test_smin_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vpmin.s16 d16, d0, d1 +; CHECK-NEXT: vpmin.s16 d16, d16, d16 +; CHECK-NEXT: vpmin.s16 d16, d16, d16 +; CHECK-NEXT: vmov.s16 r0, d16[0] +; CHECK-NEXT: bx lr +entry: + %z = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %x) + ret i16 %z +} + +define i32 @test_umax_v4i32(<4 x i32> %x) { +; CHECK-LABEL: test_umax_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vpmax.u32 d16, d0, d1 +; CHECK-NEXT: vpmax.u32 d16, d16, d16 +; CHECK-NEXT: vmov.32 r0, d16[0] +; CHECK-NEXT: bx lr +entry: + %z = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %x) + ret i32 %z +} + +define i8 @test_umin_v32i8(<32 x i8> %x) { +; CHECK-LABEL: test_umin_v32i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmin.u8 q8, q0, q1 +; CHECK-NEXT: vpmin.u8 d16, d16, d17 +; CHECK-NEXT: vpmin.u8 d16, d16, d16 +; CHECK-NEXT: vpmin.u8 d16, d16, d16 +; CHECK-NEXT: vpmin.u8 d16, d16, d16 +; CHECK-NEXT: vmov.u8 r0, d16[0] +; CHECK-NEXT: bx lr +entry: + %z = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %x) + ret i8 %z +} + +declare i8 @llvm.vector.reduce.umin.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.smin.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>) +declare i8 
@llvm.vector.reduce.smax.v8i8(<8 x i8>) +declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.smin.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>) +declare i32 @llvm.vector.reduce.umin.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.smin.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.umax.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.smax.v2i32(<2 x i32>) + +declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>) +declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>) + +declare i8 @llvm.vector.reduce.umin.v32i8(<32 x i8>) From 06f16232b1b0028ac87d584883bc32220882c73a Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Mon, 20 Mar 2023 17:31:40 +0000 Subject: [PATCH 007/208] [RISCV][NFC] Make interleaved access test more vectorizable The previous test case stored the result of a deinterleaved load and add into the same source address, which resulted in some scatters which we weren't testing for and made the tests harder to understand. Store it at a separate address, which will make the tests easier to read when the cost model is changed after D145085 is landed Reviewed By: reames Differential Revision: https://reviews.llvm.org/D146442 --- .../RISCV/interleaved-accesses.ll | 62 ++++++++++--------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll index b81d14c520770..d51f7becebeb5 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll @@ -395,7 +395,7 @@ exit: ret void } -define void @combine_load_factor2_i32(ptr %p) { +define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; CHECK-LABEL: @combine_load_factor2_i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -403,30 +403,31 @@ define void @combine_load_factor2_i32(ptr %p) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP0:%.*]] = shl <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP1:%.*]] = shl <4 x i64> [[STEP_ADD]], -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], <4 x i64> [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[P]], <4 x i64> [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x ptr> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i32 0 -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4 -; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = 
getelementptr i32, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4 +; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC2]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <8 x i32> [[WIDE_VEC2]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[STRIDED_VEC4]] -; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC5]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP8]], <4 x ptr> [[TMP2]], i32 4, <4 x i1> ) -; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP9]], <4 x ptr> [[TMP3]], i32 4, <4 x i1> ) +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[STRIDED_VEC3]] +; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[STRIDED_VEC2]], [[STRIDED_VEC4]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[Q]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0 +; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP10]], i32 4 +; CHECK-NEXT: store <4 x i32> [[TMP9]], ptr [[TMP13]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -442,7 +443,8 @@ define void @combine_load_factor2_i32(ptr %p) { ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; CHECK-NEXT: [[RES:%.*]] = add i32 [[X0]], [[X1]] -; CHECK-NEXT: store i32 [[RES]], ptr [[Q0]], align 4 +; CHECK-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]] +; CHECK-NEXT: store i32 [[RES]], ptr [[DST]], align 4 ; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 ; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] @@ -464,7 +466,8 @@ loop: %res = add i32 %x0, %x1 - store i32 %res, ptr %q0 + %dst = getelementptr i32, ptr %q, i64 %i + store i32 %res, ptr %dst %nexti = add i64 %i, 1 %done = icmp eq i64 %nexti, 1024 @@ -473,7 +476,7 @@ exit: ret void } -define void @combine_load_factor2_i64(ptr %p) { +define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; 
CHECK-LABEL: @combine_load_factor2_i64( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] @@ -486,7 +489,8 @@ define void @combine_load_factor2_i64(ptr %p) { ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 4 ; CHECK-NEXT: [[RES:%.*]] = add i64 [[X0]], [[X1]] -; CHECK-NEXT: store i64 [[RES]], ptr [[Q0]], align 4 +; CHECK-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[I]] +; CHECK-NEXT: store i64 [[RES]], ptr [[DST]], align 4 ; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 ; CHECK-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]] @@ -508,7 +512,8 @@ loop: %res = add i64 %x0, %x1 - store i64 %res, ptr %q0 + %dst = getelementptr i64, ptr %q, i64 %i + store i64 %res, ptr %dst %nexti = add i64 %i, 1 %done = icmp eq i64 %nexti, 1024 @@ -516,3 +521,4 @@ loop: exit: ret void } + From 65890469cebb675e9fa0271dc1ab3b1da15df302 Mon Sep 17 00:00:00 2001 From: Caroline Concatto Date: Wed, 22 Mar 2023 14:57:18 +0000 Subject: [PATCH 008/208] [AArch64] Add asm aliases for MOV, LDR, STR with predicate-as-counter In the 2022-12 release of the A64 ISA it was updated that the assembler must also accept predicate-as-counter register names for the source predicate register and the destination predicate register for: * *MOV: Move predicate (unpredicated)* * *LDR (predicate): Load predicate register* * *STR (predicate): Store predicate register* Reviewed By: sdesmalen Differential Revision: https://reviews.llvm.org/D146311 --- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 19 +++++++ llvm/lib/Target/AArch64/SVEInstrFormats.td | 2 - llvm/test/MC/AArch64/SVE/pfalse.s | 6 --- .../SVE/predicate-as-counter-aliases.s | 50 +++++++++++++++++++ 4 files changed, 69 insertions(+), 8 deletions(-) create mode 100644 llvm/test/MC/AArch64/SVE/predicate-as-counter-aliases.s diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 858b352c8c72e..ba33e9cfe949c 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -3844,6 +3844,25 @@ defm WHILEHS_CXX : sve2p1_int_while_rr_pn<"whilehs", 0b100>; defm WHILEHI_CXX : sve2p1_int_while_rr_pn<"whilehi", 0b101>; defm WHILELO_CXX : sve2p1_int_while_rr_pn<"whilelo", 0b110>; defm WHILELS_CXX : sve2p1_int_while_rr_pn<"whilels", 0b111>; + + +// Aliases for existing SVE instructions for which predicate-as-counter are +// accepted as an operand to the instruction +def : InstAlias<"ldr $Pt, [$Rn, $imm9, mul vl]", + (LDR_PXI PNRAny:$Pt, GPR64sp:$Rn, simm9:$imm9), 0>; +def : InstAlias<"ldr $Pt, [$Rn]", + (LDR_PXI PNRAny:$Pt, GPR64sp:$Rn, 0), 0>; + +def : InstAlias<"str $Pt, [$Rn, $imm9, mul vl]", + (STR_PXI PNRAny:$Pt, GPR64sp:$Rn, simm9:$imm9), 0>; +def : InstAlias<"str $Pt, [$Rn]", + (STR_PXI PNRAny:$Pt, GPR64sp:$Rn, 0), 0>; + +def : InstAlias<"mov $Pd, $Pn", + (ORR_PPzPP PNR8:$Pd, PNR8:$Pn, PNR8:$Pn, PNR8:$Pn), 0>; + +def : InstAlias<"pfalse\t$Pd", (PFALSE PNR8:$Pd), 0>; + } // End HasSVE2p1_or_HasSME2 //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 1d3bf9150ca41..736d5b40ccb8d 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -728,8 +728,6 @@ class sve_int_pfalse opc, string asm> multiclass sve_int_pfalse opc, string asm> { def 
NAME : sve_int_pfalse; - def : InstAlias<"pfalse\t$Pd", (!cast(NAME) PNR8:$Pd), 0>; - def : Pat<(nxv16i1 immAllZerosV), (!cast(NAME))>; def : Pat<(nxv8i1 immAllZerosV), (!cast(NAME))>; def : Pat<(nxv4i1 immAllZerosV), (!cast(NAME))>; diff --git a/llvm/test/MC/AArch64/SVE/pfalse.s b/llvm/test/MC/AArch64/SVE/pfalse.s index 7ac4d5c44f433..4124da8ac92a0 100644 --- a/llvm/test/MC/AArch64/SVE/pfalse.s +++ b/llvm/test/MC/AArch64/SVE/pfalse.s @@ -14,9 +14,3 @@ pfalse p15.b // CHECK-ENCODING: [0x0f,0xe4,0x18,0x25] // CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 2518e40f - -pfalse pn15.b -// CHECK-INST: pfalse p15.b -// CHECK-ENCODING: [0x0f,0xe4,0x18,0x25] -// CHECK-ERROR: instruction requires: sve or sme -// CHECK-UNKNOWN: 2518e40f diff --git a/llvm/test/MC/AArch64/SVE/predicate-as-counter-aliases.s b/llvm/test/MC/AArch64/SVE/predicate-as-counter-aliases.s new file mode 100644 index 0000000000000..bca2cf913ff64 --- /dev/null +++ b/llvm/test/MC/AArch64/SVE/predicate-as-counter-aliases.s @@ -0,0 +1,50 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p1 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p1 < %s \ +// RUN: | llvm-objdump --no-print-imm-hex -d --mattr=+sve2p1 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p1 < %s \ +// RUN: | llvm-objdump --no-print-imm-hex -d --mattr=-sve - | FileCheck %s --check-prefix=CHECK-UNKNOWN + + +ldr pn0, [x0] +// CHECK-INST: ldr p0, [x0] +// CHECK-ENCODING: [0x00,0x00,0x80,0x85] +// CHECK-ERROR: instruction requires: sme2 or sve2p1 +// CHECK-UNKNOWN: 85800000 + +ldr pn5, [x10, #255, mul vl] +// CHECK-INST: ldr p5, [x10, #255, mul vl] +// CHECK-ENCODING: [0x45,0x1d,0x9f,0x85] +// CHECK-ERROR: instruction requires: sme2 or sve2p1 +// CHECK-UNKNOWN: 859f1d45 + + +str pn0, [x0] +// CHECK-INST: str p0, [x0] +// CHECK-ENCODING: [0x00,0x00,0x80,0xe5] +// CHECK-ERROR: instruction requires: sme2 or sve2p1 +// CHECK-UNKNOWN: e5800000 + +str pn5, [x10, #255, mul vl] +// CHECK-INST: str p5, [x10, #255, mul vl] +// CHECK-ENCODING: [0x45,0x1d,0x9f,0xe5] +// CHECK-ERROR: instruction requires: sme2 or sve2p1 +// CHECK-UNKNOWN: e59f1d45 + + +mov pn0.b, pn0.b +// CHECK-INST: mov p0.b, p0.b +// CHECK-ENCODING: [0x00,0x40,0x80,0x25] +// CHECK-ERROR: instruction requires: sme2 or sve2p1 +// CHECK-UNKNOWN: 25804000 + + +pfalse pn15.b +// CHECK-INST: pfalse p15.b +// CHECK-ENCODING: [0x0f,0xe4,0x18,0x25] +// CHECK-ERROR: instruction requires: sme2 or sve2p1 +// CHECK-UNKNOWN: 2518e40f From 0528087663f1558a2f662d4317b0b63d8f4a6fca Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Wed, 22 Mar 2023 09:21:12 -0700 Subject: [PATCH 009/208] [NFC][WebAssembly] Autogenerate test expectations for tailcall.ll A follow-on commit will add tests to this file and using the update_llc_test_checks script will make that easier. 
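For reference, regenerating the CHECK lines is normally a single run of the
helper script over the test file. A typical invocation looks like the
following (illustrative only, not recorded in this commit; the script picks
up llc from PATH unless --llc-binary is given):

  llvm/utils/update_llc_test_checks.py llvm/test/CodeGen/WebAssembly/tailcall.ll
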
Differential Revision: https://reviews.llvm.org/D146568 --- llvm/test/CodeGen/WebAssembly/tailcall.ll | 445 ++++++++++++++++++---- 1 file changed, 367 insertions(+), 78 deletions(-) diff --git a/llvm/test/CodeGen/WebAssembly/tailcall.ll b/llvm/test/CodeGen/WebAssembly/tailcall.ll index 07cdea1ec9b0f..34dd0a9a424b6 100644 --- a/llvm/test/CodeGen/WebAssembly/tailcall.ll +++ b/llvm/test/CodeGen/WebAssembly/tailcall.ll @@ -1,5 +1,6 @@ -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mcpu=mvp -mattr=+tail-call | FileCheck --check-prefixes=CHECK,SLOW %s -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -fast-isel -mcpu=mvp -mattr=+tail-call | FileCheck --check-prefixes=CHECK,FAST %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mcpu=mvp -mattr=+tail-call | FileCheck --check-prefixes=CHECK,SLOW %s +; RUN: llc < %s -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -fast-isel -mcpu=mvp -mattr=+tail-call | FileCheck --check-prefixes=CHECK,FAST %s ; RUN: llc < %s --filetype=obj -mattr=+tail-call | obj2yaml | FileCheck --check-prefix=YAML %s ; Test that the tail calls lower correctly @@ -10,101 +11,177 @@ target triple = "wasm32-unknown-unknown" declare i1 @foo(i1) declare i1 @bar(i1) -; CHECK-LABEL: recursive_notail_nullary: -; CHECK: {{^}} call recursive_notail_nullary{{$}} -; CHECK-NEXT: return define void @recursive_notail_nullary() { +; CHECK-LABEL: recursive_notail_nullary: +; CHECK: .functype recursive_notail_nullary () -> () +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: call recursive_notail_nullary +; CHECK-NEXT: return notail call void @recursive_notail_nullary() ret void } -; CHECK-LABEL: recursive_musttail_nullary: -; CHECK: return_call recursive_musttail_nullary{{$}} define void @recursive_musttail_nullary() { +; CHECK-LABEL: recursive_musttail_nullary: +; CHECK: .functype recursive_musttail_nullary () -> () +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: return_call recursive_musttail_nullary musttail call void @recursive_musttail_nullary() ret void } - -; CHECK-LABEL: recursive_tail_nullary: -; SLOW: return_call recursive_tail_nullary{{$}} -; FAST: {{^}} call recursive_tail_nullary{{$}} -; FAST-NEXT: return{{$}} define void @recursive_tail_nullary() { +; SLOW-LABEL: recursive_tail_nullary: +; SLOW: .functype recursive_tail_nullary () -> () +; SLOW-NEXT: # %bb.0: +; SLOW-NEXT: return_call recursive_tail_nullary +; +; FAST-LABEL: recursive_tail_nullary: +; FAST: .functype recursive_tail_nullary () -> () +; FAST-NEXT: # %bb.0: +; FAST-NEXT: call recursive_tail_nullary +; FAST-NEXT: return tail call void @recursive_tail_nullary() ret void } -; CHECK-LABEL: recursive_notail: -; CHECK: call $push[[L:[0-9]+]]=, recursive_notail, $0, $1{{$}} -; CHECK-NEXT: return $pop[[L]]{{$}} define i32 @recursive_notail(i32 %x, i32 %y) { +; CHECK-LABEL: recursive_notail: +; CHECK: .functype recursive_notail (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: call $push0=, recursive_notail, $0, $1 +; CHECK-NEXT: return $pop0 %v = notail call i32 @recursive_notail(i32 %x, i32 %y) ret i32 %v } -; CHECK-LABEL: recursive_musttail: -; CHECK: return_call recursive_musttail, $0, $1{{$}} define 
i32 @recursive_musttail(i32 %x, i32 %y) { +; CHECK-LABEL: recursive_musttail: +; CHECK: .functype recursive_musttail (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: return_call recursive_musttail, $0, $1 %v = musttail call i32 @recursive_musttail(i32 %x, i32 %y) ret i32 %v } -; CHECK-LABEL: recursive_tail: -; SLOW: return_call recursive_tail, $0, $1{{$}} -; FAST: call $push[[L:[0-9]+]]=, recursive_tail, $0, $1{{$}} -; FAST-NEXT: return $pop[[L]]{{$}} define i32 @recursive_tail(i32 %x, i32 %y) { +; SLOW-LABEL: recursive_tail: +; SLOW: .functype recursive_tail (i32, i32) -> (i32) +; SLOW-NEXT: # %bb.0: +; SLOW-NEXT: return_call recursive_tail, $0, $1 +; +; FAST-LABEL: recursive_tail: +; FAST: .functype recursive_tail (i32, i32) -> (i32) +; FAST-NEXT: # %bb.0: +; FAST-NEXT: call $push0=, recursive_tail, $0, $1 +; FAST-NEXT: return $pop0 %v = tail call i32 @recursive_tail(i32 %x, i32 %y) ret i32 %v } -; CHECK-LABEL: indirect_notail: -; CHECK: call_indirect $push[[L:[0-9]+]]=, $0, $1, $2, $0{{$}} -; CHECK-NEXT: return $pop[[L]]{{$}} define i32 @indirect_notail(%fn %f, i32 %x, i32 %y) { +; CHECK-LABEL: indirect_notail: +; CHECK: .functype indirect_notail (i32, i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: call_indirect $push0=, $0, $1, $2, $0 # Invalid depth argument! +; CHECK-NEXT: return $pop0 %p = extractvalue %fn %f, 0 %v = notail call i32 %p(%fn %f, i32 %x, i32 %y) ret i32 %v } -; CHECK-LABEL: indirect_musttail: -; CHECK: return_call_indirect , $0, $1, $2, $0{{$}} define i32 @indirect_musttail(%fn %f, i32 %x, i32 %y) { +; CHECK-LABEL: indirect_musttail: +; CHECK: .functype indirect_musttail (i32, i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: return_call_indirect , $0, $1, $2, $0 %p = extractvalue %fn %f, 0 %v = musttail call i32 %p(%fn %f, i32 %x, i32 %y) ret i32 %v } -; CHECK-LABEL: indirect_tail: -; CHECK: return_call_indirect , $0, $1, $2, $0{{$}} define i32 @indirect_tail(%fn %f, i32 %x, i32 %y) { +; CHECK-LABEL: indirect_tail: +; CHECK: .functype indirect_tail (i32, i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: return_call_indirect , $0, $1, $2, $0 %p = extractvalue %fn %f, 0 %v = tail call i32 %p(%fn %f, i32 %x, i32 %y) ret i32 %v } -; CHECK-LABEL: choice_notail: -; CHECK: call_indirect $push[[L:[0-9]+]]=, $0, $pop{{[0-9]+}}{{$}} -; CHECK-NEXT: return $pop[[L]]{{$}} define i1 @choice_notail(i1 %x) { +; SLOW-LABEL: choice_notail: +; SLOW: .functype choice_notail (i32) -> (i32) +; SLOW-NEXT: # %bb.0: +; SLOW-NEXT: i32.const $push3=, foo +; SLOW-NEXT: i32.const $push2=, bar +; SLOW-NEXT: i32.const $push0=, 1 +; SLOW-NEXT: i32.and $push1=, $0, $pop0 +; SLOW-NEXT: i32.select $push4=, $pop3, $pop2, $pop1 +; SLOW-NEXT: call_indirect $push5=, $0, $pop4 # Invalid depth argument! +; SLOW-NEXT: return $pop5 +; +; FAST-LABEL: choice_notail: +; FAST: .functype choice_notail (i32) -> (i32) +; FAST-NEXT: # %bb.0: +; FAST-NEXT: i32.const $push3=, foo +; FAST-NEXT: i32.const $push4=, bar +; FAST-NEXT: i32.const $push1=, 1 +; FAST-NEXT: i32.and $push2=, $0, $pop1 +; FAST-NEXT: i32.select $push5=, $pop3, $pop4, $pop2 +; FAST-NEXT: call_indirect $push0=, $0, $pop5 # Invalid depth argument! 
+; FAST-NEXT: return $pop0 %p = select i1 %x, ptr @foo, ptr @bar %v = notail call i1 %p(i1 %x) ret i1 %v } -; CHECK-LABEL: choice_musttail: -; CHECK: return_call_indirect , $0, $pop{{[0-9]+}}{{$}} define i1 @choice_musttail(i1 %x) { +; SLOW-LABEL: choice_musttail: +; SLOW: .functype choice_musttail (i32) -> (i32) +; SLOW-NEXT: # %bb.0: +; SLOW-NEXT: i32.const $push3=, foo +; SLOW-NEXT: i32.const $push2=, bar +; SLOW-NEXT: i32.const $push0=, 1 +; SLOW-NEXT: i32.and $push1=, $0, $pop0 +; SLOW-NEXT: i32.select $push4=, $pop3, $pop2, $pop1 +; SLOW-NEXT: return_call_indirect , $0, $pop4 +; +; FAST-LABEL: choice_musttail: +; FAST: .functype choice_musttail (i32) -> (i32) +; FAST-NEXT: # %bb.0: +; FAST-NEXT: i32.const $push4=, foo +; FAST-NEXT: i32.const $push3=, bar +; FAST-NEXT: i32.const $push1=, 1 +; FAST-NEXT: i32.and $push2=, $0, $pop1 +; FAST-NEXT: i32.select $push0=, $pop4, $pop3, $pop2 +; FAST-NEXT: return_call_indirect , $0, $pop0 %p = select i1 %x, ptr @foo, ptr @bar %v = musttail call i1 %p(i1 %x) ret i1 %v } -; CHECK-LABEL: choice_tail: -; SLOW: return_call_indirect , $0, $pop{{[0-9]+}}{{$}} -; FAST: call_indirect $push[[L:[0-9]+]]=, $0, $pop{{[0-9]+}}{{$}} -; FAST: return $pop[[L]]{{$}} define i1 @choice_tail(i1 %x) { +; SLOW-LABEL: choice_tail: +; SLOW: .functype choice_tail (i32) -> (i32) +; SLOW-NEXT: # %bb.0: +; SLOW-NEXT: i32.const $push3=, foo +; SLOW-NEXT: i32.const $push2=, bar +; SLOW-NEXT: i32.const $push0=, 1 +; SLOW-NEXT: i32.and $push1=, $0, $pop0 +; SLOW-NEXT: i32.select $push4=, $pop3, $pop2, $pop1 +; SLOW-NEXT: return_call_indirect , $0, $pop4 +; +; FAST-LABEL: choice_tail: +; FAST: .functype choice_tail (i32) -> (i32) +; FAST-NEXT: # %bb.0: +; FAST-NEXT: i32.const $push3=, foo +; FAST-NEXT: i32.const $push4=, bar +; FAST-NEXT: i32.const $push1=, 1 +; FAST-NEXT: i32.and $push2=, $0, $pop1 +; FAST-NEXT: i32.select $push5=, $pop3, $pop4, $pop2 +; FAST-NEXT: call_indirect $push0=, $0, $pop5 # Invalid depth argument! +; FAST-NEXT: return $pop0 %p = select i1 %x, ptr @foo, ptr @bar %v = tail call i1 %p(i1 %x) ret i1 %v @@ -114,95 +191,200 @@ define i1 @choice_tail(i1 %x) { ; prototype than its caller, so the following tests can only be done with ; 'tail'. 
-; CHECK-LABEL: mismatched_prototypes: -; SLOW: return_call baz, $pop{{[0-9]+}}, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}} -; FAST: call $push[[L:[0-9]+]]=, baz, $pop{{[0-9]+}}, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}} -; FAST: return $pop[[L]]{{$}} declare i32 @baz(i32, i32, i32) define i32 @mismatched_prototypes() { +; SLOW-LABEL: mismatched_prototypes: +; SLOW: .functype mismatched_prototypes () -> (i32) +; SLOW-NEXT: # %bb.0: +; SLOW-NEXT: i32.const $push2=, 0 +; SLOW-NEXT: i32.const $push1=, 42 +; SLOW-NEXT: i32.const $push0=, 6 +; SLOW-NEXT: return_call baz, $pop2, $pop1, $pop0 +; +; FAST-LABEL: mismatched_prototypes: +; FAST: .functype mismatched_prototypes () -> (i32) +; FAST-NEXT: # %bb.0: +; FAST-NEXT: i32.const $push1=, 0 +; FAST-NEXT: i32.const $push2=, 42 +; FAST-NEXT: i32.const $push3=, 6 +; FAST-NEXT: call $push0=, baz, $pop1, $pop2, $pop3 +; FAST-NEXT: return $pop0 %v = tail call i32 @baz(i32 0, i32 42, i32 6) ret i32 %v } -; CHECK-LABEL: mismatched_return_void: -; CHECK: call $drop=, baz, $pop{{[0-9]+}}, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}} -; CHECK: return{{$}} define void @mismatched_return_void() { +; SLOW-LABEL: mismatched_return_void: +; SLOW: .functype mismatched_return_void () -> () +; SLOW-NEXT: # %bb.0: +; SLOW-NEXT: i32.const $push2=, 0 +; SLOW-NEXT: i32.const $push1=, 42 +; SLOW-NEXT: i32.const $push0=, 6 +; SLOW-NEXT: call $drop=, baz, $pop2, $pop1, $pop0 +; SLOW-NEXT: return +; +; FAST-LABEL: mismatched_return_void: +; FAST: .functype mismatched_return_void () -> () +; FAST-NEXT: # %bb.0: +; FAST-NEXT: i32.const $push0=, 0 +; FAST-NEXT: i32.const $push1=, 42 +; FAST-NEXT: i32.const $push2=, 6 +; FAST-NEXT: call $drop=, baz, $pop0, $pop1, $pop2 +; FAST-NEXT: return %v = tail call i32 @baz(i32 0, i32 42, i32 6) ret void } -; CHECK-LABEL: mismatched_return_f32: -; CHECK: call $push[[L:[0-9]+]]=, baz, $pop{{[0-9]+}}, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}} -; CHECK: f32.reinterpret_i32 $push[[L1:[0-9]+]]=, $pop[[L]]{{$}} -; CHECK: return $pop[[L1]]{{$}} define float @mismatched_return_f32() { +; SLOW-LABEL: mismatched_return_f32: +; SLOW: .functype mismatched_return_f32 () -> (f32) +; SLOW-NEXT: # %bb.0: +; SLOW-NEXT: i32.const $push2=, 0 +; SLOW-NEXT: i32.const $push1=, 42 +; SLOW-NEXT: i32.const $push0=, 6 +; SLOW-NEXT: call $push3=, baz, $pop2, $pop1, $pop0 +; SLOW-NEXT: f32.reinterpret_i32 $push4=, $pop3 +; SLOW-NEXT: return $pop4 +; +; FAST-LABEL: mismatched_return_f32: +; FAST: .functype mismatched_return_f32 () -> (f32) +; FAST-NEXT: # %bb.0: +; FAST-NEXT: i32.const $push2=, 0 +; FAST-NEXT: i32.const $push3=, 42 +; FAST-NEXT: i32.const $push4=, 6 +; FAST-NEXT: call $push1=, baz, $pop2, $pop3, $pop4 +; FAST-NEXT: f32.reinterpret_i32 $push0=, $pop1 +; FAST-NEXT: return $pop0 %v = tail call i32 @baz(i32 0, i32 42, i32 6) %u = bitcast i32 %v to float ret float %u } -; CHECK-LABEL: mismatched_indirect_void: -; CHECK: call_indirect $drop=, $0, $1, $2, $0{{$}} -; CHECK: return{{$}} define void @mismatched_indirect_void(%fn %f, i32 %x, i32 %y) { +; CHECK-LABEL: mismatched_indirect_void: +; CHECK: .functype mismatched_indirect_void (i32, i32, i32) -> () +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: call_indirect $drop=, $0, $1, $2, $0 # Invalid depth argument! 
+; CHECK-NEXT: return %p = extractvalue %fn %f, 0 %v = tail call i32 %p(%fn %f, i32 %x, i32 %y) ret void } -; CHECK-LABEL: mismatched_indirect_f32: -; CHECK: call_indirect $push[[L:[0-9]+]]=, $0, $1, $2, $0{{$}} -; CHECK: f32.reinterpret_i32 $push[[L1:[0-9]+]]=, $pop[[L]]{{$}} -; CHECK: return $pop[[L1]]{{$}} define float @mismatched_indirect_f32(%fn %f, i32 %x, i32 %y) { +; CHECK-LABEL: mismatched_indirect_f32: +; CHECK: .functype mismatched_indirect_f32 (i32, i32, i32) -> (f32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: call_indirect $push0=, $0, $1, $2, $0 # Invalid depth argument! +; CHECK-NEXT: f32.reinterpret_i32 $push1=, $pop0 +; CHECK-NEXT: return $pop1 %p = extractvalue %fn %f, 0 %v = tail call i32 %p(%fn %f, i32 %x, i32 %y) %u = bitcast i32 %v to float ret float %u } -; CHECK-LABEL: mismatched_byval: -; CHECK: i32.store -; CHECK: return_call quux, $pop{{[0-9]+}}{{$}} declare i32 @quux(ptr byval(i32)) define i32 @mismatched_byval(ptr %x) { +; CHECK-LABEL: mismatched_byval: +; CHECK: .functype mismatched_byval (i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: global.get $push1=, __stack_pointer +; CHECK-NEXT: i32.const $push2=, 16 +; CHECK-NEXT: i32.sub $push8=, $pop1, $pop2 +; CHECK-NEXT: local.tee $push7=, $1=, $pop8 +; CHECK-NEXT: global.set __stack_pointer, $pop7 +; CHECK-NEXT: i32.load $push0=, 0($0) +; CHECK-NEXT: i32.store 12($1), $pop0 +; CHECK-NEXT: i32.const $push3=, 16 +; CHECK-NEXT: i32.add $push4=, $1, $pop3 +; CHECK-NEXT: global.set __stack_pointer, $pop4 +; CHECK-NEXT: i32.const $push5=, 12 +; CHECK-NEXT: i32.add $push6=, $1, $pop5 +; CHECK-NEXT: return_call quux, $pop6 %v = tail call i32 @quux(ptr byval(i32) %x) ret i32 %v } -; CHECK-LABEL: varargs: -; CHECK: i32.store -; CHECK: call $0=, var, $1{{$}} -; CHECK: return $0{{$}} declare i32 @var(...) define i32 @varargs(i32 %x) { +; CHECK-LABEL: varargs: +; CHECK: .functype varargs (i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: global.get $push0=, __stack_pointer +; CHECK-NEXT: i32.const $push1=, 16 +; CHECK-NEXT: i32.sub $push5=, $pop0, $pop1 +; CHECK-NEXT: local.tee $push4=, $1=, $pop5 +; CHECK-NEXT: global.set __stack_pointer, $pop4 +; CHECK-NEXT: i32.store 0($1), $0 +; CHECK-NEXT: call $0=, var, $1 +; CHECK-NEXT: i32.const $push2=, 16 +; CHECK-NEXT: i32.add $push3=, $1, $pop2 +; CHECK-NEXT: global.set __stack_pointer, $pop3 +; CHECK-NEXT: return $0 %v = tail call i32 (...) 
@var(i32 %x) ret i32 %v } ; Type transformations inhibit tail calls, even when they are nops -; CHECK-LABEL: mismatched_return_zext: -; CHECK: call define i32 @mismatched_return_zext() { +; SLOW-LABEL: mismatched_return_zext: +; SLOW: .functype mismatched_return_zext () -> (i32) +; SLOW-NEXT: # %bb.0: +; SLOW-NEXT: i32.const $push0=, 1 +; SLOW-NEXT: call $push1=, foo, $pop0 +; SLOW-NEXT: i32.const $push3=, 1 +; SLOW-NEXT: i32.and $push2=, $pop1, $pop3 +; SLOW-NEXT: return $pop2 +; +; FAST-LABEL: mismatched_return_zext: +; FAST: .functype mismatched_return_zext () -> (i32) +; FAST-NEXT: # %bb.0: +; FAST-NEXT: i32.const $push2=, 1 +; FAST-NEXT: call $push1=, foo, $pop2 +; FAST-NEXT: i32.const $push3=, 1 +; FAST-NEXT: i32.and $push0=, $pop1, $pop3 +; FAST-NEXT: return $pop0 %v = tail call i1 @foo(i1 1) %u = zext i1 %v to i32 ret i32 %u } -; CHECK-LABEL: mismatched_return_sext: -; CHECK: call define i32 @mismatched_return_sext() { +; SLOW-LABEL: mismatched_return_sext: +; SLOW: .functype mismatched_return_sext () -> (i32) +; SLOW-NEXT: # %bb.0: +; SLOW-NEXT: i32.const $push3=, 0 +; SLOW-NEXT: i32.const $push0=, 1 +; SLOW-NEXT: call $push1=, foo, $pop0 +; SLOW-NEXT: i32.const $push5=, 1 +; SLOW-NEXT: i32.and $push2=, $pop1, $pop5 +; SLOW-NEXT: i32.sub $push4=, $pop3, $pop2 +; SLOW-NEXT: return $pop4 +; +; FAST-LABEL: mismatched_return_sext: +; FAST: .functype mismatched_return_sext () -> (i32) +; FAST-NEXT: # %bb.0: +; FAST-NEXT: i32.const $push4=, 1 +; FAST-NEXT: call $push3=, foo, $pop4 +; FAST-NEXT: i32.const $push0=, 31 +; FAST-NEXT: i32.shl $push1=, $pop3, $pop0 +; FAST-NEXT: i32.const $push5=, 31 +; FAST-NEXT: i32.shr_s $push2=, $pop1, $pop5 +; FAST-NEXT: return $pop2 %v = tail call i1 @foo(i1 1) %u = sext i1 %v to i32 ret i32 %u } -; CHECK-LABEL: mismatched_return_trunc: -; CHECK: call declare i32 @int() define i1 @mismatched_return_trunc() { +; CHECK-LABEL: mismatched_return_trunc: +; CHECK: .functype mismatched_return_trunc () -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: call $push0=, int +; CHECK-NEXT: return $pop0 %v = tail call i32 @int() %u = trunc i32 %v to i1 ret i1 %u @@ -210,30 +392,115 @@ define i1 @mismatched_return_trunc() { ; Stack-allocated arguments inhibit tail calls -; CHECK-LABEL: stack_arg: -; CHECK: call define i32 @stack_arg(ptr %x) { +; SLOW-LABEL: stack_arg: +; SLOW: .functype stack_arg (i32) -> (i32) +; SLOW-NEXT: # %bb.0: +; SLOW-NEXT: global.get $push0=, __stack_pointer +; SLOW-NEXT: i32.const $push1=, 16 +; SLOW-NEXT: i32.sub $push7=, $pop0, $pop1 +; SLOW-NEXT: local.tee $push6=, $2=, $pop7 +; SLOW-NEXT: global.set __stack_pointer, $pop6 +; SLOW-NEXT: i32.const $push4=, 12 +; SLOW-NEXT: i32.add $push5=, $2, $pop4 +; SLOW-NEXT: call $1=, stack_arg, $pop5 +; SLOW-NEXT: i32.const $push2=, 16 +; SLOW-NEXT: i32.add $push3=, $2, $pop2 +; SLOW-NEXT: global.set __stack_pointer, $pop3 +; SLOW-NEXT: return $1 +; +; FAST-LABEL: stack_arg: +; FAST: .functype stack_arg (i32) -> (i32) +; FAST-NEXT: # %bb.0: +; FAST-NEXT: global.get $push1=, __stack_pointer +; FAST-NEXT: i32.const $push2=, 16 +; FAST-NEXT: i32.sub $push8=, $pop1, $pop2 +; FAST-NEXT: local.tee $push7=, $2=, $pop8 +; FAST-NEXT: global.set __stack_pointer, $pop7 +; FAST-NEXT: i32.const $push5=, 12 +; FAST-NEXT: i32.add $push6=, $2, $pop5 +; FAST-NEXT: local.copy $push0=, $pop6 +; FAST-NEXT: call $1=, stack_arg, $pop0 +; FAST-NEXT: i32.const $push3=, 16 +; FAST-NEXT: i32.add $push4=, $2, $pop3 +; FAST-NEXT: global.set __stack_pointer, $pop4 +; FAST-NEXT: return $1 %a = alloca i32 %v = tail call i32 
@stack_arg(ptr %a) ret i32 %v } -; CHECK-LABEL: stack_arg_gep: -; CHECK: call define i32 @stack_arg_gep(ptr %x) { +; SLOW-LABEL: stack_arg_gep: +; SLOW: .functype stack_arg_gep (i32) -> (i32) +; SLOW-NEXT: # %bb.0: +; SLOW-NEXT: global.get $push2=, __stack_pointer +; SLOW-NEXT: i32.const $push3=, 16 +; SLOW-NEXT: i32.sub $push9=, $pop2, $pop3 +; SLOW-NEXT: local.tee $push8=, $2=, $pop9 +; SLOW-NEXT: global.set __stack_pointer, $pop8 +; SLOW-NEXT: i32.const $push6=, 8 +; SLOW-NEXT: i32.add $push7=, $2, $pop6 +; SLOW-NEXT: i32.const $push0=, 4 +; SLOW-NEXT: i32.or $push1=, $pop7, $pop0 +; SLOW-NEXT: call $1=, stack_arg_gep, $pop1 +; SLOW-NEXT: i32.const $push4=, 16 +; SLOW-NEXT: i32.add $push5=, $2, $pop4 +; SLOW-NEXT: global.set __stack_pointer, $pop5 +; SLOW-NEXT: return $1 +; +; FAST-LABEL: stack_arg_gep: +; FAST: .functype stack_arg_gep (i32) -> (i32) +; FAST-NEXT: # %bb.0: +; FAST-NEXT: global.get $push3=, __stack_pointer +; FAST-NEXT: i32.const $push4=, 16 +; FAST-NEXT: i32.sub $push10=, $pop3, $pop4 +; FAST-NEXT: local.tee $push9=, $2=, $pop10 +; FAST-NEXT: global.set __stack_pointer, $pop9 +; FAST-NEXT: i32.const $push7=, 8 +; FAST-NEXT: i32.add $push8=, $2, $pop7 +; FAST-NEXT: local.copy $push0=, $pop8 +; FAST-NEXT: i32.const $push1=, 4 +; FAST-NEXT: i32.add $push2=, $pop0, $pop1 +; FAST-NEXT: call $1=, stack_arg_gep, $pop2 +; FAST-NEXT: i32.const $push5=, 16 +; FAST-NEXT: i32.add $push6=, $2, $pop5 +; FAST-NEXT: global.set __stack_pointer, $pop6 +; FAST-NEXT: return $1 %a = alloca { i32, i32 } %p = getelementptr { i32, i32 }, ptr %a, i32 0, i32 1 %v = tail call i32 @stack_arg_gep(ptr %p) ret i32 %v } -; CHECK-LABEL: stack_arg_cast: -; CHECK: global.get $push{{[0-9]+}}=, __stack_pointer -; CHECK: global.set __stack_pointer, $pop{{[0-9]+}} -; FAST: call ${{[0-9]+}}=, stack_arg_cast, $pop{{[0-9]+}} -; CHECK: global.set __stack_pointer, $pop{{[0-9]+}} -; SLOW: return_call stack_arg_cast, ${{[0-9]+}} define i32 @stack_arg_cast(i32 %x) { +; SLOW-LABEL: stack_arg_cast: +; SLOW: .functype stack_arg_cast (i32) -> (i32) +; SLOW-NEXT: # %bb.0: +; SLOW-NEXT: global.get $push0=, __stack_pointer +; SLOW-NEXT: i32.const $push1=, 256 +; SLOW-NEXT: i32.sub $push5=, $pop0, $pop1 +; SLOW-NEXT: local.tee $push4=, $1=, $pop5 +; SLOW-NEXT: global.set __stack_pointer, $pop4 +; SLOW-NEXT: i32.const $push2=, 256 +; SLOW-NEXT: i32.add $push3=, $1, $pop2 +; SLOW-NEXT: global.set __stack_pointer, $pop3 +; SLOW-NEXT: return_call stack_arg_cast, $1 +; +; FAST-LABEL: stack_arg_cast: +; FAST: .functype stack_arg_cast (i32) -> (i32) +; FAST-NEXT: # %bb.0: +; FAST-NEXT: global.get $push1=, __stack_pointer +; FAST-NEXT: i32.const $push2=, 256 +; FAST-NEXT: i32.sub $push6=, $pop1, $pop2 +; FAST-NEXT: local.tee $push5=, $2=, $pop6 +; FAST-NEXT: global.set __stack_pointer, $pop5 +; FAST-NEXT: local.copy $push0=, $2 +; FAST-NEXT: call $1=, stack_arg_cast, $pop0 +; FAST-NEXT: i32.const $push3=, 256 +; FAST-NEXT: i32.add $push4=, $2, $pop3 +; FAST-NEXT: global.set __stack_pointer, $pop4 +; FAST-NEXT: return $1 %a = alloca [64 x i32] %i = ptrtoint ptr %a to i32 %v = tail call i32 @stack_arg_cast(i32 %i) @@ -252,6 +519,28 @@ define i32 @stack_arg_cast(i32 %x) { ; YAML-NEXT: ReturnTypes: ; YAML-NEXT: - I32 define i32 @unique_caller(ptr %p) { +; SLOW-LABEL: unique_caller: +; SLOW: .functype unique_caller (i32) -> (i32) +; SLOW-NEXT: # %bb.0: +; SLOW-NEXT: i32.const $push4=, 0 +; SLOW-NEXT: f32.const $push3=, 0x0p0 +; SLOW-NEXT: i64.const $push2=, 0 +; SLOW-NEXT: f64.const $push1=, 0x0p0 +; SLOW-NEXT: i32.load $push0=, 0($0) 
+; SLOW-NEXT: return_call_indirect , $pop4, $pop3, $pop2, $pop1, $pop0 +; +; FAST-LABEL: unique_caller: +; FAST: .functype unique_caller (i32) -> (i32) +; FAST-NEXT: # %bb.0: +; FAST-NEXT: i32.const $push1=, 0 +; FAST-NEXT: i32.const $push7=, 0 +; FAST-NEXT: f32.convert_i32_s $push2=, $pop7 +; FAST-NEXT: i64.const $push3=, 0 +; FAST-NEXT: i32.const $push6=, 0 +; FAST-NEXT: f64.convert_i32_s $push4=, $pop6 +; FAST-NEXT: i32.load $push5=, 0($0) +; FAST-NEXT: call_indirect $push0=, $pop1, $pop2, $pop3, $pop4, $pop5 # Invalid depth argument! +; FAST-NEXT: return $pop0 %f = load ptr, ptr %p %v = tail call i32 %f(i32 0, float 0., i64 0, double 0.) ret i32 %v From 3f23c7f5bedc8786d3f4567d2331a7efcbb2a77e Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Tue, 21 Mar 2023 18:00:08 -0700 Subject: [PATCH 010/208] [InstSimplify] Actually use NewOps for calls in simplifyInstructionWithOperands Resolves a TODO. Reviewed By: nikic Differential Revision: https://reviews.llvm.org/D146599 --- .../llvm/Analysis/InstructionSimplify.h | 5 +- llvm/lib/Analysis/InstructionSimplify.cpp | 118 +++++++++--------- .../InstCombine/InstCombineCalls.cpp | 10 +- llvm/unittests/Transforms/Utils/LocalTest.cpp | 3 +- 4 files changed, 72 insertions(+), 64 deletions(-) diff --git a/llvm/include/llvm/Analysis/InstructionSimplify.h b/llvm/include/llvm/Analysis/InstructionSimplify.h index 861fa3b20a495..826bd45d8057b 100644 --- a/llvm/include/llvm/Analysis/InstructionSimplify.h +++ b/llvm/include/llvm/Analysis/InstructionSimplify.h @@ -302,8 +302,9 @@ Value *simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, Value *simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q); -/// Given a callsite, fold the result or return null. -Value *simplifyCall(CallBase *Call, const SimplifyQuery &Q); +/// Given a callsite, callee, and arguments, fold the result or return null. +Value *simplifyCall(CallBase *Call, Value *Callee, ArrayRef Args, + const SimplifyQuery &Q); /// Given a constrained FP intrinsic call, tries to compute its simplified /// version. Returns a simplified result or null. diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index ecb0cdbd13c62..eaf0af92484d7 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -6391,10 +6391,13 @@ static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1, return nullptr; } -static Value *simplifyIntrinsic(CallBase *Call, const SimplifyQuery &Q) { - - unsigned NumOperands = Call->arg_size(); - Function *F = cast(Call->getCalledFunction()); +static Value *simplifyIntrinsic(CallBase *Call, Value *Callee, + ArrayRef Args, + const SimplifyQuery &Q) { + // Operand bundles should not be in Args. + assert(Call->arg_size() == Args.size()); + unsigned NumOperands = Args.size(); + Function *F = cast(Callee); Intrinsic::ID IID = F->getIntrinsicID(); // Most of the intrinsics with no operands have some kind of side effect. @@ -6420,18 +6423,17 @@ static Value *simplifyIntrinsic(CallBase *Call, const SimplifyQuery &Q) { } if (NumOperands == 1) - return simplifyUnaryIntrinsic(F, Call->getArgOperand(0), Q); + return simplifyUnaryIntrinsic(F, Args[0], Q); if (NumOperands == 2) - return simplifyBinaryIntrinsic(F, Call->getArgOperand(0), - Call->getArgOperand(1), Q); + return simplifyBinaryIntrinsic(F, Args[0], Args[1], Q); // Handle intrinsics with 3 or more arguments. 
switch (IID) { case Intrinsic::masked_load: case Intrinsic::masked_gather: { - Value *MaskArg = Call->getArgOperand(2); - Value *PassthruArg = Call->getArgOperand(3); + Value *MaskArg = Args[2]; + Value *PassthruArg = Args[3]; // If the mask is all zeros or undef, the "passthru" argument is the result. if (maskIsAllZeroOrUndef(MaskArg)) return PassthruArg; @@ -6439,8 +6441,7 @@ static Value *simplifyIntrinsic(CallBase *Call, const SimplifyQuery &Q) { } case Intrinsic::fshl: case Intrinsic::fshr: { - Value *Op0 = Call->getArgOperand(0), *Op1 = Call->getArgOperand(1), - *ShAmtArg = Call->getArgOperand(2); + Value *Op0 = Args[0], *Op1 = Args[1], *ShAmtArg = Args[2]; // If both operands are undef, the result is undef. if (Q.isUndefValue(Op0) && Q.isUndefValue(Op1)) @@ -6448,14 +6449,14 @@ static Value *simplifyIntrinsic(CallBase *Call, const SimplifyQuery &Q) { // If shift amount is undef, assume it is zero. if (Q.isUndefValue(ShAmtArg)) - return Call->getArgOperand(IID == Intrinsic::fshl ? 0 : 1); + return Args[IID == Intrinsic::fshl ? 0 : 1]; const APInt *ShAmtC; if (match(ShAmtArg, m_APInt(ShAmtC))) { // If there's effectively no shift, return the 1st arg or 2nd arg. APInt BitWidth = APInt(ShAmtC->getBitWidth(), ShAmtC->getBitWidth()); if (ShAmtC->urem(BitWidth).isZero()) - return Call->getArgOperand(IID == Intrinsic::fshl ? 0 : 1); + return Args[IID == Intrinsic::fshl ? 0 : 1]; } // Rotating zero by anything is zero. @@ -6469,31 +6470,24 @@ static Value *simplifyIntrinsic(CallBase *Call, const SimplifyQuery &Q) { return nullptr; } case Intrinsic::experimental_constrained_fma: { - Value *Op0 = Call->getArgOperand(0); - Value *Op1 = Call->getArgOperand(1); - Value *Op2 = Call->getArgOperand(2); auto *FPI = cast(Call); - if (Value *V = - simplifyFPOp({Op0, Op1, Op2}, {}, Q, *FPI->getExceptionBehavior(), - *FPI->getRoundingMode())) + if (Value *V = simplifyFPOp(Args, {}, Q, *FPI->getExceptionBehavior(), + *FPI->getRoundingMode())) return V; return nullptr; } case Intrinsic::fma: case Intrinsic::fmuladd: { - Value *Op0 = Call->getArgOperand(0); - Value *Op1 = Call->getArgOperand(1); - Value *Op2 = Call->getArgOperand(2); - if (Value *V = simplifyFPOp({Op0, Op1, Op2}, {}, Q, fp::ebIgnore, + if (Value *V = simplifyFPOp(Args, {}, Q, fp::ebIgnore, RoundingMode::NearestTiesToEven)) return V; return nullptr; } case Intrinsic::smul_fix: case Intrinsic::smul_fix_sat: { - Value *Op0 = Call->getArgOperand(0); - Value *Op1 = Call->getArgOperand(1); - Value *Op2 = Call->getArgOperand(2); + Value *Op0 = Args[0]; + Value *Op1 = Args[1]; + Value *Op2 = Args[2]; Type *ReturnType = F->getReturnType(); // Canonicalize constant operand as Op1 (ConstantFolding handles the case @@ -6520,9 +6514,9 @@ static Value *simplifyIntrinsic(CallBase *Call, const SimplifyQuery &Q) { return nullptr; } case Intrinsic::vector_insert: { - Value *Vec = Call->getArgOperand(0); - Value *SubVec = Call->getArgOperand(1); - Value *Idx = Call->getArgOperand(2); + Value *Vec = Args[0]; + Value *SubVec = Args[1]; + Value *Idx = Args[2]; Type *ReturnType = F->getReturnType(); // (insert_vector Y, (extract_vector X, 0), 0) -> X @@ -6539,51 +6533,52 @@ static Value *simplifyIntrinsic(CallBase *Call, const SimplifyQuery &Q) { } case Intrinsic::experimental_constrained_fadd: { auto *FPI = cast(Call); - return simplifyFAddInst( - FPI->getArgOperand(0), FPI->getArgOperand(1), FPI->getFastMathFlags(), - Q, *FPI->getExceptionBehavior(), *FPI->getRoundingMode()); + return simplifyFAddInst(Args[0], Args[1], FPI->getFastMathFlags(), Q, + 
*FPI->getExceptionBehavior(), + *FPI->getRoundingMode()); } case Intrinsic::experimental_constrained_fsub: { auto *FPI = cast(Call); - return simplifyFSubInst( - FPI->getArgOperand(0), FPI->getArgOperand(1), FPI->getFastMathFlags(), - Q, *FPI->getExceptionBehavior(), *FPI->getRoundingMode()); + return simplifyFSubInst(Args[0], Args[1], FPI->getFastMathFlags(), Q, + *FPI->getExceptionBehavior(), + *FPI->getRoundingMode()); } case Intrinsic::experimental_constrained_fmul: { auto *FPI = cast(Call); - return simplifyFMulInst( - FPI->getArgOperand(0), FPI->getArgOperand(1), FPI->getFastMathFlags(), - Q, *FPI->getExceptionBehavior(), *FPI->getRoundingMode()); + return simplifyFMulInst(Args[0], Args[1], FPI->getFastMathFlags(), Q, + *FPI->getExceptionBehavior(), + *FPI->getRoundingMode()); } case Intrinsic::experimental_constrained_fdiv: { auto *FPI = cast(Call); - return simplifyFDivInst( - FPI->getArgOperand(0), FPI->getArgOperand(1), FPI->getFastMathFlags(), - Q, *FPI->getExceptionBehavior(), *FPI->getRoundingMode()); + return simplifyFDivInst(Args[0], Args[1], FPI->getFastMathFlags(), Q, + *FPI->getExceptionBehavior(), + *FPI->getRoundingMode()); } case Intrinsic::experimental_constrained_frem: { auto *FPI = cast(Call); - return simplifyFRemInst( - FPI->getArgOperand(0), FPI->getArgOperand(1), FPI->getFastMathFlags(), - Q, *FPI->getExceptionBehavior(), *FPI->getRoundingMode()); + return simplifyFRemInst(Args[0], Args[1], FPI->getFastMathFlags(), Q, + *FPI->getExceptionBehavior(), + *FPI->getRoundingMode()); } default: return nullptr; } } -static Value *tryConstantFoldCall(CallBase *Call, const SimplifyQuery &Q) { - auto *F = dyn_cast(Call->getCalledOperand()); +static Value *tryConstantFoldCall(CallBase *Call, Value *Callee, + ArrayRef Args, + const SimplifyQuery &Q) { + auto *F = dyn_cast(Callee); if (!F || !canConstantFoldCallTo(Call, F)) return nullptr; SmallVector ConstantArgs; - unsigned NumArgs = Call->arg_size(); - ConstantArgs.reserve(NumArgs); - for (auto &Arg : Call->args()) { - Constant *C = dyn_cast(&Arg); + ConstantArgs.reserve(Args.size()); + for (Value *Arg : Args) { + Constant *C = dyn_cast(Arg); if (!C) { - if (isa(Arg.get())) + if (isa(Arg)) continue; return nullptr; } @@ -6593,7 +6588,11 @@ static Value *tryConstantFoldCall(CallBase *Call, const SimplifyQuery &Q) { return ConstantFoldCall(Call, F, ConstantArgs, Q.TLI); } -Value *llvm::simplifyCall(CallBase *Call, const SimplifyQuery &Q) { +Value *llvm::simplifyCall(CallBase *Call, Value *Callee, ArrayRef Args, + const SimplifyQuery &Q) { + // Args should not contain operand bundle operands. + assert(Call->arg_size() == Args.size()); + // musttail calls can only be simplified if they are also DCEd. // As we can't guarantee this here, don't simplify them. 
if (Call->isMustTailCall()) @@ -6601,16 +6600,15 @@ Value *llvm::simplifyCall(CallBase *Call, const SimplifyQuery &Q) { // call undef -> poison // call null -> poison - Value *Callee = Call->getCalledOperand(); if (isa(Callee) || isa(Callee)) return PoisonValue::get(Call->getType()); - if (Value *V = tryConstantFoldCall(Call, Q)) + if (Value *V = tryConstantFoldCall(Call, Callee, Args, Q)) return V; auto *F = dyn_cast(Callee); if (F && F->isIntrinsic()) - if (Value *Ret = simplifyIntrinsic(Call, Q)) + if (Value *Ret = simplifyIntrinsic(Call, Callee, Args, Q)) return Ret; return nullptr; @@ -6618,9 +6616,10 @@ Value *llvm::simplifyCall(CallBase *Call, const SimplifyQuery &Q) { Value *llvm::simplifyConstrainedFPCall(CallBase *Call, const SimplifyQuery &Q) { assert(isa(Call)); - if (Value *V = tryConstantFoldCall(Call, Q)) + SmallVector Args(Call->args()); + if (Value *V = tryConstantFoldCall(Call, Call->getCalledOperand(), Args, Q)) return V; - if (Value *Ret = simplifyIntrinsic(Call, Q)) + if (Value *Ret = simplifyIntrinsic(Call, Call->getCalledOperand(), Args, Q)) return Ret; return nullptr; } @@ -6775,8 +6774,9 @@ static Value *simplifyInstructionWithOperands(Instruction *I, case Instruction::PHI: return simplifyPHINode(cast(I), NewOps, Q); case Instruction::Call: - // TODO: Use NewOps - return simplifyCall(cast(I), Q); + return simplifyCall( + cast(I), NewOps.back(), + NewOps.drop_back(1 + cast(I)->getNumTotalBundleOperands()), Q); case Instruction::Freeze: return llvm::simplifyFreezeInst(NewOps[0], Q); #define HANDLE_CAST_INST(num, opc, clas) case Instruction::opc: diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 2b61b58dbc36a..0fbd62e8a41c0 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1288,9 +1288,15 @@ foldShuffledIntrinsicOperands(IntrinsicInst *II, Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { // Don't try to simplify calls without uses. It will not do anything useful, // but will result in the following folds being skipped. - if (!CI.use_empty()) - if (Value *V = simplifyCall(&CI, SQ.getWithInstruction(&CI))) + if (!CI.use_empty()) { + SmallVector Args; + Args.reserve(CI.arg_size()); + for (Value *Op : CI.args()) + Args.push_back(Op); + if (Value *V = simplifyCall(&CI, CI.getCalledOperand(), Args, + SQ.getWithInstruction(&CI))) return replaceInstUsesWith(CI, V); + } if (Value *FreedOp = getFreedOperand(&CI, &TLI)) return visitFree(CI, FreedOp); diff --git a/llvm/unittests/Transforms/Utils/LocalTest.cpp b/llvm/unittests/Transforms/Utils/LocalTest.cpp index d6b09b35f2caf..443f1f09915fd 100644 --- a/llvm/unittests/Transforms/Utils/LocalTest.cpp +++ b/llvm/unittests/Transforms/Utils/LocalTest.cpp @@ -598,7 +598,8 @@ TEST(Local, SimplifyVScaleWithRange) { // Test that simplifyCall won't try to query it's parent function for // vscale_range attributes in order to simplify llvm.vscale -> constant. - EXPECT_EQ(simplifyCall(CI, SimplifyQuery(M.getDataLayout())), nullptr); + EXPECT_EQ(simplifyCall(CI, VScale, {}, SimplifyQuery(M.getDataLayout())), + nullptr); delete CI; } From d868135691bb0d5c924b8fd2ae26171fbf5d1387 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Tue, 21 Mar 2023 19:01:12 +0100 Subject: [PATCH 011/208] [libc++] Qualifies ptrdiff_t and max_align_t. 
This has been done using the following commands find libcxx/test -type f -exec perl -pi -e 's|^([^/]+?)((?(); - test(); + test(); test(); return 0; diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find.first.of/ranges.find_first_of.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find.first.of/ranges.find_first_of.pass.cpp index 5673c70c394b4..cea30420428cd 100644 --- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find.first.of/ranges.find_first_of.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find.first.of/ranges.find_first_of.pass.cpp @@ -71,7 +71,7 @@ template struct Data { std::array input1; std::array input2; - ptrdiff_t expected; + std::ptrdiff_t expected; }; template diff --git a/libcxx/test/std/atomics/stdatomic.h.syn/types.compile.pass.cpp b/libcxx/test/std/atomics/stdatomic.h.syn/types.compile.pass.cpp index 84f49a76a810f..3121e7c57bafe 100644 --- a/libcxx/test/std/atomics/stdatomic.h.syn/types.compile.pass.cpp +++ b/libcxx/test/std/atomics/stdatomic.h.syn/types.compile.pass.cpp @@ -203,7 +203,7 @@ void f() { static_assert(std::is_same_v, ::atomic_intptr_t>); static_assert(std::is_same_v, ::atomic_uintptr_t>); static_assert(std::is_same_v, ::atomic_size_t>); - static_assert(std::is_same_v, ::atomic_ptrdiff_t>); + static_assert(std::is_same_v, ::atomic_ptrdiff_t>); static_assert(std::is_same_v, ::atomic_intmax_t>); static_assert(std::is_same_v, ::atomic_uintmax_t>); diff --git a/libcxx/test/std/atomics/types.pass.cpp b/libcxx/test/std/atomics/types.pass.cpp index e0b617071c04f..63ab0f30c4a75 100644 --- a/libcxx/test/std/atomics/types.pass.cpp +++ b/libcxx/test/std/atomics/types.pass.cpp @@ -61,7 +61,7 @@ struct test_atomic A a; (void)a; #if TEST_STD_VER >= 17 static_assert((std::is_same_v), ""); - static_assert((std::is_same_v), ""); + static_assert((std::is_same_v), ""); #endif } }; @@ -149,7 +149,7 @@ int main(int, char**) test (); test (); test (); - test (); + test (); test (); test (); diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_range.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_range.pass.cpp index 527c72c1e7c8d..f2ab6e5faa19d 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_range.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_range.pass.cpp @@ -19,21 +19,21 @@ #include "test_macros.h" #include "min_allocator.h" -typedef ptrdiff_t T; +typedef std::ptrdiff_t T; const T t1[] = {0, 1, 2, 3, 4, 5, 6, 7}; const T t2[] = {10, 11, 12, 13, 14, 15}; -const ptrdiff_t size_t1 = std::end(t1) - std::begin(t1); -const ptrdiff_t size_t2 = std::end(t2) - std::begin(t2); +const std::ptrdiff_t size_t1 = std::end(t1) - std::begin(t1); +const std::ptrdiff_t size_t2 = std::end(t2) - std::begin(t2); template void -testd(const C& c, ptrdiff_t p, ptrdiff_t f, ptrdiff_t l) +testd(const C& c, std::ptrdiff_t p, ptrdiff_t f, ptrdiff_t l) { typename C::const_iterator i = c.begin(); - ptrdiff_t n1 = 0; + std::ptrdiff_t n1 = 0; for (; n1 < p; ++n1, ++i) assert(*i == t1[n1]); - for (ptrdiff_t n2 = f; n2 < l-1; ++n2, ++i) + for (std::ptrdiff_t n2 = f; n2 < l-1; ++n2, ++i) assert(*i == t2[n2]); for (; n1 < size_t1; ++n1, ++i) assert(*i == t1[n1]); @@ -42,11 +42,11 @@ testd(const C& c, ptrdiff_t p, ptrdiff_t f, ptrdiff_t l) template void -tests(const C& c, ptrdiff_t p, ptrdiff_t f, ptrdiff_t l) +tests(const C& c, std::ptrdiff_t p, ptrdiff_t f, ptrdiff_t l) { typename C::const_iterator i = 
c.begin(); - ptrdiff_t n = 0; - ptrdiff_t d = l > f+1 ? l-1-f : 0; + std::ptrdiff_t n = 0; + std::ptrdiff_t d = l > f+1 ? l-1-f : 0; if (d == 0 || p == f) { for (n = 0; n < size_t1; ++n, ++i) @@ -82,11 +82,11 @@ int main(int, char**) { // splicing different containers typedef std::forward_list C; - for (ptrdiff_t f = 0; f <= size_t2+1; ++f) + for (std::ptrdiff_t f = 0; f <= size_t2+1; ++f) { - for (ptrdiff_t l = f; l <= size_t2+1; ++l) + for (std::ptrdiff_t l = f; l <= size_t2+1; ++l) { - for (ptrdiff_t p = 0; p <= size_t1; ++p) + for (std::ptrdiff_t p = 0; p <= size_t1; ++p) { C c1(std::begin(t1), std::end(t1)); C c2(std::begin(t2), std::end(t2)); @@ -99,11 +99,11 @@ int main(int, char**) } // splicing within same container - for (ptrdiff_t f = 0; f <= size_t1+1; ++f) + for (std::ptrdiff_t f = 0; f <= size_t1+1; ++f) { - for (ptrdiff_t l = f; l <= size_t1; ++l) + for (std::ptrdiff_t l = f; l <= size_t1; ++l) { - for (ptrdiff_t p = 0; p <= f; ++p) + for (std::ptrdiff_t p = 0; p <= f; ++p) { C c1(std::begin(t1), std::end(t1)); @@ -111,7 +111,7 @@ int main(int, char**) std::next(c1.cbefore_begin(), f), std::next(c1.cbefore_begin(), l)); tests(c1, p, f, l); } - for (ptrdiff_t p = l; p <= size_t1; ++p) + for (std::ptrdiff_t p = l; p <= size_t1; ++p) { C c1(std::begin(t1), std::end(t1)); @@ -126,11 +126,11 @@ int main(int, char**) { // splicing different containers typedef std::forward_list> C; - for (ptrdiff_t f = 0; f <= size_t2+1; ++f) + for (std::ptrdiff_t f = 0; f <= size_t2+1; ++f) { - for (ptrdiff_t l = f; l <= size_t2+1; ++l) + for (std::ptrdiff_t l = f; l <= size_t2+1; ++l) { - for (ptrdiff_t p = 0; p <= size_t1; ++p) + for (std::ptrdiff_t p = 0; p <= size_t1; ++p) { C c1(std::begin(t1), std::end(t1)); C c2(std::begin(t2), std::end(t2)); @@ -143,11 +143,11 @@ int main(int, char**) } // splicing within same container - for (ptrdiff_t f = 0; f <= size_t1+1; ++f) + for (std::ptrdiff_t f = 0; f <= size_t1+1; ++f) { - for (ptrdiff_t l = f; l <= size_t1; ++l) + for (std::ptrdiff_t l = f; l <= size_t1; ++l) { - for (ptrdiff_t p = 0; p <= f; ++p) + for (std::ptrdiff_t p = 0; p <= f; ++p) { C c1(std::begin(t1), std::end(t1)); @@ -155,7 +155,7 @@ int main(int, char**) std::next(c1.cbefore_begin(), f), std::next(c1.cbefore_begin(), l)); tests(c1, p, f, l); } - for (ptrdiff_t p = l; p <= size_t1; ++p) + for (std::ptrdiff_t p = l; p <= size_t1; ++p) { C c1(std::begin(t1), std::end(t1)); diff --git a/libcxx/test/std/iterators/iterator.container/ssize.pass.cpp b/libcxx/test/std/iterators/iterator.container/ssize.pass.cpp index 9be44094a099c..e7531aec12b24 100644 --- a/libcxx/test/std/iterators/iterator.container/ssize.pass.cpp +++ b/libcxx/test/std/iterators/iterator.container/ssize.pass.cpp @@ -100,7 +100,7 @@ int main(int, char**) test_const_container ( sv ); static constexpr int arrA [] { 1, 2, 3 }; - ASSERT_SAME_TYPE(ptrdiff_t, decltype(std::ssize(arrA))); + ASSERT_SAME_TYPE(std::ptrdiff_t, decltype(std::ssize(arrA))); static_assert( std::is_signed_v, ""); test_const_array ( arrA ); diff --git a/libcxx/test/std/iterators/predef.iterators/move.iterators/move.iter.ops/move.iter.nonmember/iter_move.pass.cpp b/libcxx/test/std/iterators/predef.iterators/move.iterators/move.iter.ops/move.iter.nonmember/iter_move.pass.cpp index a9e6d17720ab3..e0f63ac594367 100644 --- a/libcxx/test/std/iterators/predef.iterators/move.iterators/move.iter.ops/move.iter.nonmember/iter_move.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/move.iterators/move.iter.ops/move.iter.nonmember/iter_move.pass.cpp @@ -28,7 
+28,7 @@ template struct MaybeNoexceptMove { int x; using value_type = int; - using difference_type = ptrdiff_t; + using difference_type = std::ptrdiff_t; constexpr friend value_type&& iter_move(MaybeNoexceptMove) noexcept(IsNoexcept) { return std::move(global); diff --git a/libcxx/test/std/iterators/predef.iterators/move.iterators/move.iter.ops/move.iter.nonmember/iter_swap.pass.cpp b/libcxx/test/std/iterators/predef.iterators/move.iterators/move.iter.ops/move.iter.nonmember/iter_swap.pass.cpp index 075930dcb0a07..aa0815390391c 100644 --- a/libcxx/test/std/iterators/predef.iterators/move.iterators/move.iter.ops/move.iter.nonmember/iter_swap.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/move.iterators/move.iter.ops/move.iter.nonmember/iter_swap.pass.cpp @@ -26,7 +26,7 @@ template struct MaybeNoexceptSwap { using value_type = int; - using difference_type = ptrdiff_t; + using difference_type = std::ptrdiff_t; constexpr friend void iter_swap(MaybeNoexceptSwap, MaybeNoexceptSwap) noexcept(IsNoexcept) { } diff --git a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cmp/sfinae.compile.pass.cpp b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cmp/sfinae.compile.pass.cpp index 92e0e5cd8f9fa..8ead39231c0ba 100644 --- a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cmp/sfinae.compile.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cmp/sfinae.compile.pass.cpp @@ -49,7 +49,7 @@ struct IterBase { using iterator_category = std::bidirectional_iterator_tag; using value_type = int; - using difference_type = ptrdiff_t; + using difference_type = std::ptrdiff_t; using pointer = int*; using reference = int&; diff --git a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nonmember/iter_move.pass.cpp b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nonmember/iter_move.pass.cpp index 712425a0c44ff..5e35f5c8facc4 100644 --- a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nonmember/iter_move.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nonmember/iter_move.pass.cpp @@ -55,7 +55,7 @@ constexpr bool test() { { struct ThrowingCopyNoexceptDecrement { using value_type = int; - using difference_type = ptrdiff_t; + using difference_type = std::ptrdiff_t; ThrowingCopyNoexceptDecrement(); ThrowingCopyNoexceptDecrement(const ThrowingCopyNoexceptDecrement&); @@ -80,7 +80,7 @@ constexpr bool test() { { struct NoexceptCopyThrowingDecrement { using value_type = int; - using difference_type = ptrdiff_t; + using difference_type = std::ptrdiff_t; NoexceptCopyThrowingDecrement(); NoexceptCopyThrowingDecrement(const NoexceptCopyThrowingDecrement&) noexcept; @@ -105,7 +105,7 @@ constexpr bool test() { { struct NoexceptCopyAndDecrement { using value_type = int; - using difference_type = ptrdiff_t; + using difference_type = std::ptrdiff_t; NoexceptCopyAndDecrement(); NoexceptCopyAndDecrement(const NoexceptCopyAndDecrement&) noexcept; diff --git a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nonmember/iter_swap.pass.cpp b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nonmember/iter_swap.pass.cpp index d01ee2a1b85a1..7b6fb43b0001f 100644 --- a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nonmember/iter_swap.pass.cpp +++ 
b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nonmember/iter_swap.pass.cpp @@ -61,7 +61,7 @@ constexpr bool test() { { struct ThrowingCopyNoexceptDecrement { using value_type = int; - using difference_type = ptrdiff_t; + using difference_type = std::ptrdiff_t; ThrowingCopyNoexceptDecrement(); ThrowingCopyNoexceptDecrement(const ThrowingCopyNoexceptDecrement&); @@ -89,7 +89,7 @@ constexpr bool test() { { struct NoexceptCopyThrowingDecrement { using value_type = int; - using difference_type = ptrdiff_t; + using difference_type = std::ptrdiff_t; NoexceptCopyThrowingDecrement(); NoexceptCopyThrowingDecrement(const NoexceptCopyThrowingDecrement&) noexcept; @@ -117,7 +117,7 @@ constexpr bool test() { { struct NoexceptCopyAndDecrement { using value_type = int; - using difference_type = ptrdiff_t; + using difference_type = std::ptrdiff_t; NoexceptCopyAndDecrement(); NoexceptCopyAndDecrement(const NoexceptCopyAndDecrement&) noexcept; diff --git a/libcxx/test/std/language.support/support.types/max_align_t.compile.pass.cpp b/libcxx/test/std/language.support/support.types/max_align_t.compile.pass.cpp index aef00bbc0ddac..5e335841c2a3c 100644 --- a/libcxx/test/std/language.support/support.types/max_align_t.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.types/max_align_t.compile.pass.cpp @@ -26,5 +26,5 @@ static_assert(alignof(std::max_align_t) >= alignof(long double), ""); static_assert(alignof(std::max_align_t) >= alignof(void*), ""); #if TEST_STD_VER > 14 static_assert(alignof(std::max_align_t) <= __STDCPP_DEFAULT_NEW_ALIGNMENT__, - "max_align_t alignment should be no larger than operator new's alignment"); + "std::max_align_t alignment should be no larger than operator new's alignment"); #endif diff --git a/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.pass.cpp b/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.pass.cpp index 0da878d0b1f17..cbaf5dfbec19c 100644 --- a/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.pass.cpp +++ b/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.pass.cpp @@ -90,7 +90,7 @@ int main(int, char**) static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); - static_assert(!std::is_invocable_v); + static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); diff --git a/libcxx/test/std/numerics/bit/bit.pow.two/bit_floor.pass.cpp b/libcxx/test/std/numerics/bit/bit.pow.two/bit_floor.pass.cpp index 41d5ed1e06017..473238be5e92c 100644 --- a/libcxx/test/std/numerics/bit/bit.pow.two/bit_floor.pass.cpp +++ b/libcxx/test/std/numerics/bit/bit.pow.two/bit_floor.pass.cpp @@ -87,7 +87,7 @@ int main(int, char**) static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); - static_assert(!std::is_invocable_v); + static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); diff --git a/libcxx/test/std/numerics/bit/bit.pow.two/bit_width.pass.cpp b/libcxx/test/std/numerics/bit/bit.pow.two/bit_width.pass.cpp index 53fdec6bd2076..8110048e13960 100644 --- a/libcxx/test/std/numerics/bit/bit.pow.two/bit_width.pass.cpp +++ b/libcxx/test/std/numerics/bit/bit.pow.two/bit_width.pass.cpp @@ -92,7 +92,7 @@ int main(int, char**) static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); - static_assert(!std::is_invocable_v); + static_assert(!std::is_invocable_v); 
static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); diff --git a/libcxx/test/std/numerics/bit/bit.pow.two/has_single_bit.pass.cpp b/libcxx/test/std/numerics/bit/bit.pow.two/has_single_bit.pass.cpp index 044a87fa41352..1c30f5cec5191 100644 --- a/libcxx/test/std/numerics/bit/bit.pow.two/has_single_bit.pass.cpp +++ b/libcxx/test/std/numerics/bit/bit.pow.two/has_single_bit.pass.cpp @@ -90,7 +90,7 @@ int main(int, char**) static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); - static_assert(!std::is_invocable_v); + static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); diff --git a/libcxx/test/std/numerics/bit/bitops.count/countl_one.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/countl_one.pass.cpp index 06a8a6d6cc879..b236e37ee8791 100644 --- a/libcxx/test/std/numerics/bit/bitops.count/countl_one.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.count/countl_one.pass.cpp @@ -87,7 +87,7 @@ int main(int, char**) static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); - static_assert(!std::is_invocable_v); + static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); diff --git a/libcxx/test/std/numerics/bit/bitops.count/countl_zero.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/countl_zero.pass.cpp index 5a30e57c409a6..58c953f0b97a8 100644 --- a/libcxx/test/std/numerics/bit/bitops.count/countl_zero.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.count/countl_zero.pass.cpp @@ -86,7 +86,7 @@ int main(int, char**) static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); - static_assert(!std::is_invocable_v); + static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); diff --git a/libcxx/test/std/numerics/bit/bitops.count/countr_one.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/countr_one.pass.cpp index 1698e31086ea4..208e694e0282a 100644 --- a/libcxx/test/std/numerics/bit/bitops.count/countr_one.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.count/countr_one.pass.cpp @@ -91,7 +91,7 @@ int main(int, char**) static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); - static_assert(!std::is_invocable_v); + static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); diff --git a/libcxx/test/std/numerics/bit/bitops.count/countr_zero.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/countr_zero.pass.cpp index 91fbab8bbed84..0d14d9e71044b 100644 --- a/libcxx/test/std/numerics/bit/bitops.count/countr_zero.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.count/countr_zero.pass.cpp @@ -88,7 +88,7 @@ int main(int, char**) static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); - static_assert(!std::is_invocable_v); + static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); diff --git a/libcxx/test/std/numerics/bit/bitops.count/popcount.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/popcount.pass.cpp index 00e60ff588176..383338a2f21bd 100644 --- a/libcxx/test/std/numerics/bit/bitops.count/popcount.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.count/popcount.pass.cpp @@ -98,7 +98,7 @@ int main(int, char**) 
static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); - static_assert(!std::is_invocable_v); + static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); diff --git a/libcxx/test/std/numerics/bit/bitops.rot/rotl.pass.cpp b/libcxx/test/std/numerics/bit/bitops.rot/rotl.pass.cpp index 5fe63e47403c4..6cc1410eb682f 100644 --- a/libcxx/test/std/numerics/bit/bitops.rot/rotl.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.rot/rotl.pass.cpp @@ -86,7 +86,7 @@ int main(int, char**) static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); - static_assert(!std::is_invocable_v); + static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); diff --git a/libcxx/test/std/numerics/bit/bitops.rot/rotr.pass.cpp b/libcxx/test/std/numerics/bit/bitops.rot/rotr.pass.cpp index 1b63540cd8c45..b218bb0397335 100644 --- a/libcxx/test/std/numerics/bit/bitops.rot/rotr.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.rot/rotr.pass.cpp @@ -87,7 +87,7 @@ int main(int, char**) static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); - static_assert(!std::is_invocable_v); + static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.midpoint/midpoint.integer.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.midpoint/midpoint.integer.pass.cpp index 03f3bdaf2d561..c506d0776a02c 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.midpoint/midpoint.integer.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.midpoint/midpoint.integer.pass.cpp @@ -137,7 +137,7 @@ int main(int, char**) #endif // int_test(); - signed_test(); + signed_test(); unsigned_test(); return 0; diff --git a/libcxx/test/std/ranges/range.access/ssize.pass.cpp b/libcxx/test/std/ranges/range.access/ssize.pass.cpp index a15dc344512a1..ac2c5b7b6b764 100644 --- a/libcxx/test/std/ranges/range.access/ssize.pass.cpp +++ b/libcxx/test/std/ranges/range.access/ssize.pass.cpp @@ -71,7 +71,7 @@ constexpr bool test() { // This gets converted to ptrdiff_t because it's wider. 
ShortUnsignedReturnType c; assert(std::ranges::ssize(c) == 42); - ASSERT_SAME_TYPE(decltype(std::ranges::ssize(c)), ptrdiff_t); + ASSERT_SAME_TYPE(decltype(std::ranges::ssize(c)), std::ptrdiff_t); return true; } diff --git a/libcxx/test/std/ranges/range.adaptors/range.lazy.split/constraints.compile.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.lazy.split/constraints.compile.pass.cpp index 122abe6315c11..a942f43904092 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.lazy.split/constraints.compile.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.lazy.split/constraints.compile.pass.cpp @@ -66,7 +66,7 @@ namespace test3 { struct AlmostInputIterator { using value_type = char; - using difference_type = ptrdiff_t; + using difference_type = std::ptrdiff_t; using iterator_concept = int; constexpr const char& operator*() const; diff --git a/libcxx/test/std/ranges/range.adaptors/range.lazy.split/range.lazy.split.inner/iter_move.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.lazy.split/range.lazy.split.inner/iter_move.pass.cpp index 3e5671936191e..f9d2b3e7f8950 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.lazy.split/range.lazy.split.inner/iter_move.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.lazy.split/range.lazy.split.inner/iter_move.pass.cpp @@ -23,7 +23,7 @@ namespace adl { template struct MaybeNoexceptIterator { using value_type = int; - using difference_type = ptrdiff_t; + using difference_type = std::ptrdiff_t; value_type* ptr_ = nullptr; int* iter_move_invocations_ = nullptr; diff --git a/libcxx/test/std/ranges/range.adaptors/range.lazy.split/range.lazy.split.inner/iter_swap.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.lazy.split/range.lazy.split.inner/iter_swap.pass.cpp index 7d0e8a78caedf..18fd3a31ce23c 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.lazy.split/range.lazy.split.inner/iter_swap.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.lazy.split/range.lazy.split.inner/iter_swap.pass.cpp @@ -24,7 +24,7 @@ namespace adl { template struct MaybeNoexceptIterator { using value_type = int; - using difference_type = ptrdiff_t; + using difference_type = std::ptrdiff_t; value_type* ptr_ = nullptr; int* iter_swap_invocations_ = nullptr; diff --git a/libcxx/test/std/strings/string.view/types.pass.cpp b/libcxx/test/std/strings/string.view/types.pass.cpp index f952f20561230..25dc54d257409 100644 --- a/libcxx/test/std/strings/string.view/types.pass.cpp +++ b/libcxx/test/std/strings/string.view/types.pass.cpp @@ -47,7 +47,7 @@ test() static_assert((std::is_same::value), ""); static_assert((std::is_same::value), ""); static_assert((std::is_same::value), ""); - static_assert((std::is_same::value), ""); + static_assert((std::is_same::value), ""); static_assert((std::is_same::value), ""); static_assert((std::is_same::value), ""); static_assert((std::is_same::value), ""); diff --git a/libcxx/test/std/thread/thread.semaphore/max.pass.cpp b/libcxx/test/std/thread/thread.semaphore/max.pass.cpp index 5a3026bc351e9..ca7ad0c92e60e 100644 --- a/libcxx/test/std/thread/thread.semaphore/max.pass.cpp +++ b/libcxx/test/std/thread/thread.semaphore/max.pass.cpp @@ -21,6 +21,6 @@ int main(int, char**) static_assert(std::counting_semaphore<>::max() >= 1, ""); static_assert(std::counting_semaphore<1>::max() >= 1, ""); static_assert(std::counting_semaphore::max()>::max() >= std::numeric_limits::max(), ""); - static_assert(std::counting_semaphore::max()>::max() == std::numeric_limits::max(), ""); + 
static_assert(std::counting_semaphore::max()>::max() == std::numeric_limits::max(), ""); return 0; } diff --git a/libcxx/test/std/utilities/format/format.functions/escaped_output.ascii.pass.cpp b/libcxx/test/std/utilities/format/format.functions/escaped_output.ascii.pass.cpp index a3184e4cba4a0..5ceedf9f05c42 100644 --- a/libcxx/test/std/utilities/format/format.functions/escaped_output.ascii.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/escaped_output.ascii.pass.cpp @@ -109,7 +109,7 @@ auto test_format_to_n = std::size_t n = expected.size(); std::basic_string out(n, CharT(' ')); std::format_to_n_result result = std::format_to_n(out.begin(), n, fmt, std::forward(args)...); - assert(result.size == static_cast(expected.size())); + assert(result.size == static_cast(expected.size())); assert(result.out == out.end()); assert(out == expected); } @@ -119,24 +119,24 @@ auto test_format_to_n = std::basic_string out(n, CharT(' ')); std::format_to_n_result result = std::format_to_n(out.begin(), n, std::locale(), fmt, std::forward(args)...); - assert(result.size == static_cast(expected.size())); + assert(result.size == static_cast(expected.size())); assert(result.out == out.end()); assert(out == expected); } #endif // TEST_HAS_NO_LOCALIZATION { - ptrdiff_t n = 0; + std::ptrdiff_t n = 0; std::basic_string out; std::format_to_n_result result = std::format_to_n(out.begin(), n, fmt, std::forward(args)...); - assert(result.size == static_cast(expected.size())); + assert(result.size == static_cast(expected.size())); assert(result.out == out.end()); assert(out.empty()); } { - ptrdiff_t n = expected.size() / 2; + std::ptrdiff_t n = expected.size() / 2; std::basic_string out(n, CharT(' ')); std::format_to_n_result result = std::format_to_n(out.begin(), n, fmt, std::forward(args)...); - assert(result.size == static_cast(expected.size())); + assert(result.size == static_cast(expected.size())); assert(result.out == out.end()); assert(out == expected.substr(0, n)); } diff --git a/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp b/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp index 0cb0257b8ea57..6d002a10c1479 100644 --- a/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp @@ -115,7 +115,7 @@ auto test_format_to_n = std::size_t n = expected.size(); std::basic_string out(n, CharT(' ')); std::format_to_n_result result = std::format_to_n(out.begin(), n, fmt, std::forward(args)...); - assert(result.size == static_cast(expected.size())); + assert(result.size == static_cast(expected.size())); assert(result.out == out.end()); assert(out == expected); } @@ -125,24 +125,24 @@ auto test_format_to_n = std::basic_string out(n, CharT(' ')); std::format_to_n_result result = std::format_to_n(out.begin(), n, std::locale(), fmt, std::forward(args)...); - assert(result.size == static_cast(expected.size())); + assert(result.size == static_cast(expected.size())); assert(result.out == out.end()); assert(out == expected); } #endif // TEST_HAS_NO_LOCALIZATION { - ptrdiff_t n = 0; + std::ptrdiff_t n = 0; std::basic_string out; std::format_to_n_result result = std::format_to_n(out.begin(), n, fmt, std::forward(args)...); - assert(result.size == static_cast(expected.size())); + assert(result.size == static_cast(expected.size())); assert(result.out == out.end()); assert(out.empty()); } { - ptrdiff_t n = expected.size() / 2; + 
std::ptrdiff_t n = expected.size() / 2; std::basic_string out(n, CharT(' ')); std::format_to_n_result result = std::format_to_n(out.begin(), n, fmt, std::forward(args)...); - assert(result.size == static_cast(expected.size())); + assert(result.size == static_cast(expected.size())); assert(result.out == out.end()); assert(out == expected.substr(0, n)); } diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/format_kind.compile.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/format_kind.compile.pass.cpp index d343ad1b1900b..7179a674a37ad 100644 --- a/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/format_kind.compile.pass.cpp +++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/format_kind.compile.pass.cpp @@ -47,7 +47,7 @@ struct recursive_range { struct iterator { using iterator_concept = std::input_iterator_tag; using value_type = recursive_range; - using difference_type = ptrdiff_t; + using difference_type = std::ptrdiff_t; using reference = recursive_range; reference operator*() const; diff --git a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.helper/tuple.include.ranges.pass.cpp b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.helper/tuple.include.ranges.pass.cpp index 36f7745b7713f..716acbfdcebde 100644 --- a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.helper/tuple.include.ranges.pass.cpp +++ b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.helper/tuple.include.ranges.pass.cpp @@ -22,8 +22,8 @@ using Iterator = int*; class SizedSentinel { public: constexpr bool operator==(int*) const; - friend constexpr ptrdiff_t operator-(const SizedSentinel&, int*); - friend constexpr ptrdiff_t operator-(int*, const SizedSentinel&); + friend constexpr std::ptrdiff_t operator-(const SizedSentinel&, int*); + friend constexpr std::ptrdiff_t operator-(int*, const SizedSentinel&); }; static_assert(std::sized_sentinel_for); diff --git a/libcxx/test/std/utilities/utility/mem.res/mem.poly.allocator.class/mem.poly.allocator.mem/allocate_deallocate_bytes.pass.cpp b/libcxx/test/std/utilities/utility/mem.res/mem.poly.allocator.class/mem.poly.allocator.mem/allocate_deallocate_bytes.pass.cpp index 0f5e2f0ae29ad..ee7e09ac1d655 100644 --- a/libcxx/test/std/utilities/utility/mem.res/mem.poly.allocator.class/mem.poly.allocator.mem/allocate_deallocate_bytes.pass.cpp +++ b/libcxx/test/std/utilities/utility/mem.res/mem.poly.allocator.class/mem.poly.allocator.mem/allocate_deallocate_bytes.pass.cpp @@ -38,10 +38,10 @@ void test() { auto ptr = static_cast(allocation); std::fill(ptr, ptr + 13, '0'); assert(last_size == 13); - assert(last_alignment == alignof(max_align_t)); + assert(last_alignment == alignof(std::max_align_t)); allocator.deallocate_bytes(allocation, 13); assert(last_size == 13); - assert(last_alignment == alignof(max_align_t)); + assert(last_alignment == alignof(std::max_align_t)); } { void* allocation = allocator.allocate_bytes(13, 64); diff --git a/libcxx/test/support/test_iterators.h b/libcxx/test/support/test_iterators.h index b03687447c14b..f0e19fc3d5e62 100644 --- a/libcxx/test/support/test_iterators.h +++ b/libcxx/test/support/test_iterators.h @@ -455,7 +455,7 @@ TEST_CONSTEXPR Iter base(Iter i) { return i; } template struct ThrowingIterator { typedef std::bidirectional_iterator_tag iterator_category; - typedef ptrdiff_t difference_type; + typedef std::ptrdiff_t difference_type; typedef const T value_type; typedef const T * pointer; typedef const T & reference; @@ -566,7 +566,7 @@ struct 
ThrowingIterator { template struct NonThrowingIterator { typedef std::bidirectional_iterator_tag iterator_category; - typedef ptrdiff_t difference_type; + typedef std::ptrdiff_t difference_type; typedef const T value_type; typedef const T * pointer; typedef const T & reference; @@ -916,7 +916,7 @@ class Iterator { public: using value_type = int; using reference = int&; - using difference_type = ptrdiff_t; + using difference_type = std::ptrdiff_t; private: value_type* ptr_ = nullptr; From dd0bbae5efa4d23322eda905b2f9e11dfd3c5d36 Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Wed, 22 Mar 2023 09:28:47 -0700 Subject: [PATCH 012/208] [WebAssembly] Fix epilogue insertion for indirect tail calls Previously epilogues were incorrectly inserted after indirect tail calls because they did not have the `isTerminator` property. Add that property and test that they get correct epilogues. To be safe, also add other properties that were defined for direct tail calls. Differential Revision: https://reviews.llvm.org/D146569 --- .../WebAssembly/WebAssemblyInstrCall.td | 2 +- llvm/test/CodeGen/WebAssembly/tailcall.ll | 37 +++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td index 6a123f8f4030f..ca9a5ef9dda1c 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td @@ -73,7 +73,7 @@ defm RET_CALL : "return_call \t$callee", "return_call\t$callee", 0x12>, Requires<[HasTailCall]>; -let isReturn = 1 in +let isReturn = 1, isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in defm RET_CALL_INDIRECT : I<(outs), (ins TypeIndex:$type, table32_op:$table, variable_ops), (outs), (ins TypeIndex:$type, table32_op:$table), [], diff --git a/llvm/test/CodeGen/WebAssembly/tailcall.ll b/llvm/test/CodeGen/WebAssembly/tailcall.ll index 34dd0a9a424b6..84bd142462e37 100644 --- a/llvm/test/CodeGen/WebAssembly/tailcall.ll +++ b/llvm/test/CodeGen/WebAssembly/tailcall.ll @@ -507,6 +507,43 @@ define i32 @stack_arg_cast(i32 %x) { ret i32 %v } +; Checks that epilogues are inserted after return calls. 
+define i32 @direct_epilogue() { +; CHECK-LABEL: direct_epilogue: +; CHECK: .functype direct_epilogue () -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: global.get $push0=, __stack_pointer +; CHECK-NEXT: i32.const $push1=, 256 +; CHECK-NEXT: i32.sub $push5=, $pop0, $pop1 +; CHECK-NEXT: local.tee $push4=, $0=, $pop5 +; CHECK-NEXT: global.set __stack_pointer, $pop4 +; CHECK-NEXT: i32.const $push2=, 256 +; CHECK-NEXT: i32.add $push3=, $0, $pop2 +; CHECK-NEXT: global.set __stack_pointer, $pop3 +; CHECK-NEXT: return_call direct_epilogue + %a = alloca [64 x i32] + %v = musttail call i32 @direct_epilogue() + ret i32 %v +} + +define i32 @indirect_epilogue(ptr %p) { +; CHECK-LABEL: indirect_epilogue: +; CHECK: .functype indirect_epilogue (i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: global.get $push0=, __stack_pointer +; CHECK-NEXT: i32.const $push1=, 256 +; CHECK-NEXT: i32.sub $push5=, $pop0, $pop1 +; CHECK-NEXT: local.tee $push4=, $1=, $pop5 +; CHECK-NEXT: global.set __stack_pointer, $pop4 +; CHECK-NEXT: i32.const $push2=, 256 +; CHECK-NEXT: i32.add $push3=, $1, $pop2 +; CHECK-NEXT: global.set __stack_pointer, $pop3 +; CHECK-NEXT: return_call_indirect , $0, $0 + %a = alloca [64 x i32] + %v = musttail call i32 %p(ptr %p) + ret i32 %v +} + +; Check that the signatures generated for external indirectly ; return-called functions include the proper return types From 8eb464f5433ae65bec3536ddb1195e5ff5c46af0 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Wed, 22 Mar 2023 14:48:28 +0000 Subject: [PATCH 013/208] [DebugInfo] Allow parsing line tables aligned to 4 or 8-byte boundaries This allows the DWARFDebugLine::SectionParser to try parsing line tables at 4 or 8-byte boundaries if the unaligned offset appears invalid. If aligning the offset does not reduce errors, the offset is used unchanged. This is needed for llvm-dwarfdump to be able to extract the line tables (with --debug-line) from binaries produced by certain compilers that like to align each line table in the .debug_line section. Note that this alignment does not seem to be invalid since the units do point to the correct line table offsets via the DW_AT_stmt_list attribute.
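As a rough sketch (condensed from the SectionParser changes below; bounds checks and error handling omitted), the new probing logic amounts to:

    // Compute the next candidate offset after the table that just ended.
    uint64_t Offset = OldOffset + P.TotalLength + P.sizeofTotalLength();
    // If the version field at the raw offset looks unsupported, try the
    // 4- and 8-byte aligned offsets and keep the first one that looks
    // valid; otherwise the unaligned offset is used unchanged.
    if (!hasValidVersion(Offset)) {
      for (unsigned Align : {4, 8}) {
        uint64_t AlignedOffset = alignTo(Offset, Align);
        if (hasValidVersion(AlignedOffset)) {
          Offset = AlignedOffset;
          break;
        }
      }
    }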
Differential Revision: https://reviews.llvm.org/D143513 --- .../llvm/DebugInfo/DWARF/DWARFDebugLine.h | 1 + llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp | 39 +++++ .../llvm-dwarfdump/ARM/aligned_line_tables.s | 152 ++++++++++++++++++ 3 files changed, 192 insertions(+) create mode 100644 llvm/test/tools/llvm-dwarfdump/ARM/aligned_line_tables.s diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h index de9902ae2ebcb..5c01dad848fd2 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h @@ -355,6 +355,7 @@ class DWARFDebugLine { private: DWARFUnit *prepareToParse(uint64_t Offset); void moveToNextTable(uint64_t OldOffset, const Prologue &P); + bool hasValidVersion(uint64_t Offset); LineToUnitMap LineToUnit; diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp index 0725bd7744aea..dc46c76c06e86 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp @@ -1505,6 +1505,21 @@ DWARFUnit *DWARFDebugLine::SectionParser::prepareToParse(uint64_t Offset) { return U; } +bool DWARFDebugLine::SectionParser::hasValidVersion(uint64_t Offset) { + DataExtractor::Cursor Cursor(Offset); + auto [TotalLength, _] = DebugLineData.getInitialLength(Cursor); + DWARFDataExtractor HeaderData(DebugLineData, Cursor.tell() + TotalLength); + uint16_t Version = HeaderData.getU16(Cursor); + if (!Cursor) { + // Ignore any error here. + // If this is not the end of the section parseNext() will still be + // attempted, where this error will occur again (and can be handled). + consumeError(Cursor.takeError()); + return false; + } + return versionIsSupported(Version); +} + void DWARFDebugLine::SectionParser::moveToNextTable(uint64_t OldOffset, const Prologue &P) { // If the length field is not valid, we don't know where the next table is, so @@ -1518,5 +1533,29 @@ void DWARFDebugLine::SectionParser::moveToNextTable(uint64_t OldOffset, Offset = OldOffset + P.TotalLength + P.sizeofTotalLength(); if (!DebugLineData.isValidOffset(Offset)) { Done = true; + return; + } + + // Heuristic: If the version is valid, then this is probably a line table. + // Otherwise, the offset might need alignment (to a 4 or 8 byte boundary). + if (hasValidVersion(Offset)) + return; + + // ARM C/C++ Compiler aligns each line table to word boundaries and pads out + // the .debug_line section to a word multiple. Note that in the specification + // this does not seem forbidden since each unit has a DW_AT_stmt_list. + for (unsigned Align : {4, 8}) { + uint64_t AlignedOffset = alignTo(Offset, Align); + if (!DebugLineData.isValidOffset(AlignedOffset)) { + // This is almost certainly not another line table but some alignment + // padding. This assumes the alignments tested are ordered, and are + // smaller than the header size (which is true for 4 and 8). 
+ Done = true; + return; + } + if (hasValidVersion(AlignedOffset)) { + Offset = AlignedOffset; + break; + } } } diff --git a/llvm/test/tools/llvm-dwarfdump/ARM/aligned_line_tables.s b/llvm/test/tools/llvm-dwarfdump/ARM/aligned_line_tables.s new file mode 100644 index 0000000000000..f59ce7aa9f774 --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/ARM/aligned_line_tables.s @@ -0,0 +1,152 @@ +// RUN: llvm-mc %s -defsym ALIGN_4=1 -save-temp-labels -filetype obj -triple arm-none-eabi -o %t.o +// RUN: llvm-nm %t.o | FileCheck %s --check-prefix=L4 +// RUN: llvm-dwarfdump -debug-line %t.o 2>&1 | FileCheck %s --implicit-check-not='warning:' --check-prefix=MULT4 + +// RUN: llvm-mc %s -defsym ALIGN_8=1 -save-temp-labels -filetype obj -triple arm-none-eabi -o %t.o +// RUN: llvm-nm %t.o | FileCheck %s --check-prefix=L8 +// RUN: llvm-dwarfdump -debug-line %t.o 2>&1 | FileCheck %s --implicit-check-not='warning:' --check-prefix=MULT8 + +// RUN: llvm-mc %s -defsym UNALIGNED_PADDING=1 -save-temp-labels -filetype obj -triple arm-none-eabi -o %t.o +// RUN: llvm-nm %t.o | FileCheck %s --check-prefix=LUNALIGN +// RUN: llvm-dwarfdump -debug-line %t.o 2>&1 | FileCheck %s --check-prefix=UNALIGN + +/// This test is based on a real example from ARM C/C++ Compiler. +/// It verifies llvm-dwarfdump is able to dump line tables even if they've been +/// placed at aligned offsets. + +// L4: 0000002b N .Ltable0_end +// MULT4: Address Line Column File ISA Discriminator Flags +// MULT4-NEXT: ------------------ ------ ------ ------ --- ------------- ------------- +// MULT4-NEXT: 0x0000000000000000 1 0 1 0 0 is_stmt end_sequence +// MULT4-EMPTY: +// MULT4-NEXT: debug_line[0x0000002c] +// MULT4-NEXT: Line table prologue: +// MULT4-NEXT: total_length: 0x0000003a{{$}} +// MULT4-NEXT: format: DWARF32 +// MULT4-NEXT: version: 2{{$}} +// MULT4-NEXT: prologue_length: 0x0000001a +// MULT4-NEXT: min_inst_length: 2 +// MULT4-NEXT: default_is_stmt: 1 + +// L8: 00000027 N .Ltable0_end +// MULT8: Address Line Column File ISA Discriminator Flags +// MULT8-NEXT: ------------------ ------ ------ ------ --- ------------- ------------- +// MULT8-NEXT: 0x0000000000000000 1 0 1 0 0 is_stmt end_sequence +// MULT8-EMPTY: +// MULT8-NEXT: debug_line[0x00000028] +// MULT8-NEXT: Line table prologue: +// MULT8-NEXT: total_length: 0x0000003a{{$}} +// MULT8-NEXT: format: DWARF32 +// MULT8-NEXT: version: 2{{$}} +// MULT8-NEXT: prologue_length: 0x0000001a +// MULT8-NEXT: min_inst_length: 2 +// MULT8-NEXT: default_is_stmt: 1 + +/// This should fail to dump: +// LUNALIGN: 00000027 N .Ltable0_end +// UNALIGN: warning: parsing line table prologue at offset 0x00000027: unsupported version + +.section .debug_line +/// First line table +/// Unit total length: +.long .Ltable0_end - .Ltable0_start +.Ltable0_start: +.short 2 /// Version +/// Header length: +.long .Ltable0_header_end - .Ltable0_header_start +.Ltable0_header_start: +.byte 4 /// Min instruction length +.byte 1 /// Max operations per instruction +.byte 0 /// Default is statement +.byte 6 /// Line range +.byte 10 /// Opcode base +.byte 0 /// standard_opcode_lengths[DW_LNS_copy] = 0 +.byte 1 /// standard_opcode_lengths[DW_LNS_advance_pc] = 1 +.byte 1 /// standard_opcode_lengths[DW_LNS_advance_line] = 1 +.byte 1 /// standard_opcode_lengths[DW_LNS_set_file] = 1 +.byte 1 /// standard_opcode_lengths[DW_LNS_set_column] = 1 +.byte 0 /// standard_opcode_lengths[DW_LNS_negate_stmt] = 0 +.byte 0 /// standard_opcode_lengths[DW_LNS_set_basic_block] = 0 +.byte 0 /// standard_opcode_lengths[DW_LNS_const_add_pc] = 0 
+.byte 0 /// standard_opcode_lengths[DW_LNS_fixed_advance_pc] = 0 +.byte 0 /// No include directories +/// File name: +.ifdef ALIGN_4 +/// Pad out filename so next 4 byte aligned offset is a multiple of 4 and not 8. +.asciz "foobar.cpp" +.else +.asciz "test.c" +.endif +.byte 0 /// Dir idx +.byte 0 /// Mod time +.byte 0 /// Length +.byte 0 /// End files +.Ltable0_header_end: +/// Line table operations +.byte 0 /// Extended opcode +.byte 1 /// Length 1 +.byte 1 /// DW_LNE_end_sequence +.Ltable0_end: +/// End first line table +/// Padding: +.ifdef UNALIGNED_PADDING +.short 0 +.else +.byte 0 +.endif +/// Second line table +/// Unit total length: +.long .Ltable1_end - .Ltable1_start +.Ltable1_start: +.short 2 /// Version +/// Header length: +.long .Ltable1_header_end - .Ltable1_header_start +.Ltable1_header_start: +.byte 2 /// Min instruction length +.byte 1 /// Max operations per instruction +.byte 0 /// Default is statement +.byte 6 /// Line range +.byte 10 /// Opcode base +.byte 0 /// standard_opcode_lengths[DW_LNS_copy] = 0 +.byte 1 /// standard_opcode_lengths[DW_LNS_advance_pc] = 1 +.byte 1 /// standard_opcode_lengths[DW_LNS_advance_line] = 1 +.byte 1 /// standard_opcode_lengths[DW_LNS_set_file] = 1 +.byte 1 /// standard_opcode_lengths[DW_LNS_set_column] = 1 +.byte 0 /// standard_opcode_lengths[DW_LNS_negate_stmt] = 0 +.byte 0 /// standard_opcode_lengths[DW_LNS_set_basic_block] = 0 +.byte 0 /// standard_opcode_lengths[DW_LNS_const_add_pc] = 0 +.byte 0 /// standard_opcode_lengths[DW_LNS_fixed_advance_pc] = 0 +.byte 0 /// No include directories +.asciz "test.c" /// File name +.byte 0 /// Dir idx +.byte 0 /// Mod time +.byte 0 /// Length +.byte 0 /// End files +.Ltable1_header_end: +/// Line table operations +.byte 4 /// DW_LNS_set_file +.byte 1 /// File 1 +.byte 5 /// DW_LNS_set_column +.byte 1 /// Column 1 +.byte 0 /// Extended opcode +.byte 5 /// Length 5 +.byte 2 /// DW_LNE_set_address +.long 32896 /// Address = 0x00008080 +.byte 3 /// DW_LNS_advance_line +.byte 6 /// Line += 6 +.byte 1 /// DW_LNS_copy +.byte 5 /// DW_LNS_set_column +.byte 2 /// Column 2 +.byte 12 /// Special opcode (address += 0, line += 2) +.byte 30 /// Special opcode (address += 6, line += 2) +.byte 5 /// DW_LNS_set_column +.byte 1 /// Column 1 +.byte 17 /// Special opcode (address += 2, line += 1) +.byte 2 /// DW_LNS_advance_pc +.byte 4 /// += (4 * min instruction length) +.byte 0 /// Extended opcode +.byte 1 /// Length 1 +.byte 1 /// DW_LNE_end_sequence +.Ltable1_end: +/// End second line table +.short 0 /// Padding (to make section a word multiple) From 83e420c65f4a6c0b693af82cfd81ae58fd033f97 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 22 Mar 2023 09:31:51 -0700 Subject: [PATCH 014/208] [Constant] Inline ConstantInt::getSigned ConstantInt::getSigned calls ConstantInt::get with the IsSigned flag set to true. That flag normally defaults to false. For always signed constants the code base is not consistent about whether it uses ConstantInt::getSigned or ConstantInt::get with IsSigned set to true. And it's not clear how to decide which way to use. By making getSigned inline, both ways should generate the same code in the end. 
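As a minimal illustration of the equivalence this relies on (assuming an LLVMContext Ctx is in scope; the type and value are arbitrary examples, not taken from the patch):

    // With getSigned now defined inline in the header, both forms below
    // expand to the same call and should generate identical code.
    IntegerType *Ty = Type::getInt32Ty(Ctx);
    ConstantInt *A = ConstantInt::getSigned(Ty, -42);
    ConstantInt *B = ConstantInt::get(Ty, -42, /*IsSigned=*/true);
    // ConstantInts are uniqued per context, so A and B are the same object.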
Reviewed By: nikic Differential Revision: https://reviews.llvm.org/D146598 --- llvm/include/llvm/IR/Constants.h | 8 ++++++-- llvm/lib/IR/Constants.cpp | 8 -------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/llvm/include/llvm/IR/Constants.h b/llvm/include/llvm/IR/Constants.h index 9cc56ecf8e970..baa4bac8c8e14 100644 --- a/llvm/include/llvm/IR/Constants.h +++ b/llvm/include/llvm/IR/Constants.h @@ -111,8 +111,12 @@ class ConstantInt final : public ConstantData { /// either getSExtValue() or getZExtValue() will yield a correctly sized and /// signed value for the type Ty. /// Get a ConstantInt for a specific signed value. - static ConstantInt *getSigned(IntegerType *Ty, int64_t V); - static Constant *getSigned(Type *Ty, int64_t V); + static ConstantInt *getSigned(IntegerType *Ty, int64_t V) { + return get(Ty, V, true); + } + static Constant *getSigned(Type *Ty, int64_t V) { + return get(Ty, V, true); + } /// Return a ConstantInt with the specified value and an implied Type. The /// type is the integer type that corresponds to the bit width of the value. diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index ba68e6be05b52..a4b00d92ea89a 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -899,14 +899,6 @@ ConstantInt *ConstantInt::get(IntegerType *Ty, uint64_t V, bool isSigned) { return get(Ty->getContext(), APInt(Ty->getBitWidth(), V, isSigned)); } -ConstantInt *ConstantInt::getSigned(IntegerType *Ty, int64_t V) { - return get(Ty, V, true); -} - -Constant *ConstantInt::getSigned(Type *Ty, int64_t V) { - return get(Ty, V, true); -} - Constant *ConstantInt::get(Type *Ty, const APInt& V) { ConstantInt *C = get(Ty->getContext(), V); assert(C->getType() == Ty->getScalarType() && From 9e3ca7987a4dc33cdf847b79a6304b117651d21f Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Wed, 22 Mar 2023 00:54:15 +0000 Subject: [PATCH 015/208] [mlir][tosa] Canonicalize concatenate->slice sequence Adds a canonicalizer for the concatenate->slice sequence where an output of slice can be replaced with an input of concatenate. This is useful in the context of operations with complex inputs and outputs that are legalized from a framework such as TFL. For example, a TFL graph (FFT->FFT) will be legalized to the following TOSA graph: / \ slice slice \ / FFT / \ -+ concatenate | / \ | Redundant slice slice | \ / -+ FFT / \ concatenate | Concatenate and slice operations at the boundaries of the graph are useful as they maintain the correct correspondance of input/output tensors to the original TFL graph. However, consecutive complex operations will result in redundant concatenate->slice sequences which should be removed from the final TOSA graph. The canonicalization does not currently handle dynamic types. 
Signed-off-by: Luke Hutton Reviewed By: rsuderman Differential Revision: https://reviews.llvm.org/D144545 --- mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td | 1 + .../Dialect/Tosa/IR/TosaCanonicalizations.cpp | 59 +++++++++++++++++++ mlir/test/Dialect/Tosa/canonicalize.mlir | 53 +++++++++++++++++ 3 files changed, 113 insertions(+) diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td index 7c8018ad64606..b6127f1ffa3cf 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td @@ -1556,6 +1556,7 @@ def Tosa_SliceOp: Tosa_Op<"slice", [ Tosa_Tensor1Dto6D:$output ); + let hasCanonicalizer = 1; let hasFolder = 1; } diff --git a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp index 1a8a5782e11f6..16f23e4798c02 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp @@ -519,6 +519,65 @@ void ClampOp::getCanonicalizationPatterns(RewritePatternSet &results, results.add(context); } +struct ConcatSliceOptimization : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(tosa::SliceOp sliceOp, + PatternRewriter &rewriter) const override { + Value sliceInput = sliceOp.getInput(); + auto concatOp = sliceInput.getDefiningOp(); + if (!concatOp) + return rewriter.notifyMatchFailure( + sliceOp, "slice input must be concat operation"); + + OperandRange inputs = concatOp.getInput1(); + auto concatType = dyn_cast(concatOp.getType()); + if (!concatType || !concatType.hasStaticShape()) + return rewriter.notifyMatchFailure( + sliceOp, "slice input must be a static ranked tensor"); + int32_t axis = concatOp.getAxis(); + + llvm::SmallVector sliceStart(sliceOp.getStart()); + llvm::ArrayRef sliceSize = sliceOp.getSize(); + + // Validate slice on the concatenated axis. Slicing along this + // axis should span only one of the inputs to the concatenate + // operation. + std::optional replaceWithSlice; + for (auto input : inputs) { + auto inputType = dyn_cast(input.getType()); + if (!inputType || !inputType.hasStaticShape()) + return rewriter.notifyMatchFailure( + sliceOp, "concat input must be a static ranked tensor"); + + if (sliceStart[axis] >= 0 && + (sliceStart[axis] + sliceSize[axis]) <= inputType.getDimSize(axis)) { + replaceWithSlice = + rewriter + .create( + sliceOp.getLoc(), sliceOp.getType(), input, + rewriter.getDenseI64ArrayAttr(sliceOp.getStart()), + rewriter.getDenseI64ArrayAttr(sliceSize)) + .getResult(); + break; + } + sliceStart[axis] -= inputType.getDimSize(axis); + } + + if (!replaceWithSlice) + return rewriter.notifyMatchFailure( + sliceOp, "corresponding concat input not found for slice"); + + rewriter.replaceOp(sliceOp, replaceWithSlice.value()); + return success(); + } +}; + +void SliceOp::getCanonicalizationPatterns(RewritePatternSet &results, + MLIRContext *context) { + results.add(context); +} + //===----------------------------------------------------------------------===// // Operator Folders. 
//===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/Tosa/canonicalize.mlir b/mlir/test/Dialect/Tosa/canonicalize.mlir index e16a614c7cd01..77627d8c8ba62 100644 --- a/mlir/test/Dialect/Tosa/canonicalize.mlir +++ b/mlir/test/Dialect/Tosa/canonicalize.mlir @@ -434,3 +434,56 @@ func.func @fold_resize_bilinear(%arg0 : tensor<1x15x13x1xi8>) -> tensor<1x15x13x %resize = "tosa.resize"(%arg0) {mode = "BILINEAR", scale = array, offset = array, border = array} : (tensor<1x15x13x1xi8>) -> tensor<1x15x13x1xi8> return %resize : tensor<1x15x13x1xi8> } + +// ----- + +// CHECK-LABEL: @canonicalize_concat_slice_final_axis +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x12x12x1xf32>, %[[VAL_1:.*]]: tensor<1x12x12x1xf32> +// CHECK: return %[[VAL_0]], %[[VAL_1]] : tensor<1x12x12x1xf32>, tensor<1x12x12x1xf32> +func.func @canonicalize_concat_slice_final_axis(%arg0 : tensor<1x12x12x1xf32>, %arg1 : tensor<1x12x12x1xf32>) -> (tensor<1x12x12x1xf32>, tensor<1x12x12x1xf32>) { + %0 = "tosa.concat"(%arg0, %arg1) {axis = 3 : i64} : (tensor<1x12x12x1xf32>, tensor<1x12x12x1xf32>) -> tensor<1x12x12x2xf32> + %1 = "tosa.slice"(%0) {size = array, start = array} : (tensor<1x12x12x2xf32>) -> tensor<1x12x12x1xf32> + %2 = "tosa.slice"(%0) {size = array, start = array} : (tensor<1x12x12x2xf32>) -> tensor<1x12x12x1xf32> + return %1, %2 : tensor<1x12x12x1xf32>, tensor<1x12x12x1xf32> +} + +// ----- + +// CHECK-LABEL: @canonicalize_concat_slice_middle_axis +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x12x12xf32>, %[[VAL_1:.*]]: tensor<1x12x12xf32> +// CHECK: return %[[VAL_0]], %[[VAL_1]] : tensor<1x12x12xf32>, tensor<1x12x12xf32> +func.func @canonicalize_concat_slice_middle_axis(%arg0 : tensor<1x12x12xf32>, %arg1 : tensor<1x12x12xf32>) -> (tensor<1x12x12xf32>, tensor<1x12x12xf32>) { + %0 = "tosa.concat"(%arg0, %arg1) {axis = 1 : i64} : (tensor<1x12x12xf32>, tensor<1x12x12xf32>) -> tensor<1x24x12xf32> + %1 = "tosa.slice"(%0) {size = array, start = array} : (tensor<1x24x12xf32>) -> tensor<1x12x12xf32> + %2 = "tosa.slice"(%0) {size = array, start = array} : (tensor<1x24x12xf32>) -> tensor<1x12x12xf32> + return %1, %2 : tensor<1x12x12xf32>, tensor<1x12x12xf32> +} + +// ----- + +// CHECK-LABEL: @canonicalize_cross_concat_inputs +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x12x12xf32>, %[[VAL_1:.*]]: tensor<1x12x12xf32> +// CHECK: %[[VAL_2:.*]] = "tosa.concat"(%[[VAL_0]], %[[VAL_1]]) {axis = 2 : i64} : (tensor<1x12x12xf32>, tensor<1x12x12xf32>) -> tensor<1x12x24xf32> +// CHECK: %[[VAL_3:.*]] = "tosa.slice"(%[[VAL_2]]) {size = array, start = array} : (tensor<1x12x24xf32>) -> tensor<1x12x15xf32> +// CHECK: %[[VAL_4:.*]] = "tosa.slice"(%[[VAL_2]]) {size = array, start = array} : (tensor<1x12x24xf32>) -> tensor<1x12x20xf32> +// CHECK: return %[[VAL_3]], %[[VAL_4]] : tensor<1x12x15xf32>, tensor<1x12x20xf32> +func.func @canonicalize_cross_concat_inputs(%arg0 : tensor<1x12x12xf32>, %arg1 : tensor<1x12x12xf32>) -> (tensor<1x12x15xf32>, tensor<1x12x20xf32>) { + %0 = "tosa.concat"(%arg0, %arg1) {axis = 2 : i64} : (tensor<1x12x12xf32>, tensor<1x12x12xf32>) -> tensor<1x12x24xf32> + %1 = "tosa.slice"(%0) {size = array, start = array} : (tensor<1x12x24xf32>) -> tensor<1x12x15xf32> + %2 = "tosa.slice"(%0) {size = array, start = array} : (tensor<1x12x24xf32>) -> tensor<1x12x20xf32> + return %1, %2 : tensor<1x12x15xf32>, tensor<1x12x20xf32> +} + +// ----- + +// CHECK-LABEL: @canonicalize_concat_slice_on_non_concat_axis +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x12x12xf32>, %[[VAL_1:.*]]: tensor<1x12x12xf32> +// CHECK: 
%[[VAL_2:.*]] = "tosa.slice"(%[[VAL_0]]) {size = array, start = array} : (tensor<1x12x12xf32>) -> tensor<1x6x12xf32> +// CHECK: %[[VAL_3:.*]] = "tosa.slice"(%[[VAL_1]]) {size = array, start = array} : (tensor<1x12x12xf32>) -> tensor<1x3x12xf32> +// CHECK: return %[[VAL_2]], %[[VAL_3]] : tensor<1x6x12xf32>, tensor<1x3x12xf32> +func.func @canonicalize_concat_slice_on_non_concat_axis(%arg0 : tensor<1x12x12xf32>, %arg1 : tensor<1x12x12xf32>) -> (tensor<1x6x12xf32>, tensor<1x3x12xf32>) { + %0 = "tosa.concat"(%arg0, %arg1) {axis = 2 : i64} : (tensor<1x12x12xf32>, tensor<1x12x12xf32>) -> tensor<1x12x24xf32> + %1 = "tosa.slice"(%0) {size = array, start = array} : (tensor<1x12x24xf32>) -> tensor<1x6x12xf32> + %2 = "tosa.slice"(%0) {size = array, start = array} : (tensor<1x12x24xf32>) -> tensor<1x3x12xf32> + return %1, %2 : tensor<1x6x12xf32>, tensor<1x3x12xf32> +} From 164b046ebfa8d7ad36ce567e2214c97e4e7b1657 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 22 Mar 2023 10:01:17 -0700 Subject: [PATCH 016/208] [RISCV] Convert segment registers to VR registers in RISCVMCInstLower. Similar to what we do for the LMUL>1 register classes. The printing is only working today because the segment registers have "ABI" names set to their base register name. --- llvm/lib/Target/RISCV/RISCVMCInstLower.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp b/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp index 281918259cdb3..6b658539a319b 100644 --- a/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp +++ b/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp @@ -193,6 +193,19 @@ static bool lowerRISCVVMachineInstrToMCInst(const MachineInstr *MI, } else if (RISCV::FPR64RegClass.contains(Reg)) { Reg = TRI->getSubReg(Reg, RISCV::sub_32); assert(Reg && "Superregister does not exist"); + } else if (RISCV::VRN2M1RegClass.contains(Reg) || + RISCV::VRN2M2RegClass.contains(Reg) || + RISCV::VRN2M4RegClass.contains(Reg) || + RISCV::VRN3M1RegClass.contains(Reg) || + RISCV::VRN3M2RegClass.contains(Reg) || + RISCV::VRN4M1RegClass.contains(Reg) || + RISCV::VRN4M2RegClass.contains(Reg) || + RISCV::VRN5M1RegClass.contains(Reg) || + RISCV::VRN6M1RegClass.contains(Reg) || + RISCV::VRN7M1RegClass.contains(Reg) || + RISCV::VRN8M1RegClass.contains(Reg)) { + Reg = TRI->getSubReg(Reg, RISCV::sub_vrm1_0); + assert(Reg && "Subregister does not exist"); } MCOp = MCOperand::createReg(Reg); From a67e989cd2a730ea778102f2a0d965daed0182bd Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 22 Mar 2023 10:07:18 -0700 Subject: [PATCH 017/208] [RISCV] Add FallbackRegAltNameIndex to ABIRegAltName. Remove now redundant fake ABI names from vector registers. This also fixes a crash that occurs if you use fflags as an instruction operand in the assembly and use -debug. It's not a valid register for any instruction since this wouldn't be common. It doesn't have an ABI name so it crashes the register printing in the debug output. 
--- llvm/lib/Target/RISCV/RISCVRegisterInfo.td | 70 ++++++++++------------ 1 file changed, 33 insertions(+), 37 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index 301f2ad77d00c..7e91441e91f47 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -45,6 +45,7 @@ class RISCVReg64 let SubRegIndices = [sub_32]; } +let FallbackRegAltNameIndex = NoRegAltName in def ABIRegAltName : RegAltNameIndex; def sub_vrm4_0 : SubRegIndex<256>; @@ -415,51 +416,46 @@ class VRegList LIn, int start, int nf, int lmul, bit isV0> { } // Vector registers -let RegAltNameIndices = [ABIRegAltName] in { - foreach Index = 0-31 in { - def V#Index : RISCVReg, DwarfRegNum<[!add(Index, 96)]>; - } +foreach Index = 0-31 in { + def V#Index : RISCVReg, DwarfRegNum<[!add(Index, 96)]>; +} - foreach Index = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, - 24, 26, 28, 30] in { - def V#Index#M2 : RISCVRegWithSubRegs("V"#Index), - !cast("V"#!add(Index, 1))], - ["v"#Index]>, - DwarfRegAlias("V"#Index)> { - let SubRegIndices = [sub_vrm1_0, sub_vrm1_1]; - } +foreach Index = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, + 24, 26, 28, 30] in { + def V#Index#M2 : RISCVRegWithSubRegs("V"#Index), + !cast("V"#!add(Index, 1))]>, + DwarfRegAlias("V"#Index)> { + let SubRegIndices = [sub_vrm1_0, sub_vrm1_1]; } +} - foreach Index = [0, 4, 8, 12, 16, 20, 24, 28] in { - def V#Index#M4 : RISCVRegWithSubRegs("V"#Index#"M2"), - !cast("V"#!add(Index, 2)#"M2")], - ["v"#Index]>, - DwarfRegAlias("V"#Index)> { - let SubRegIndices = [sub_vrm2_0, sub_vrm2_1]; - } +foreach Index = [0, 4, 8, 12, 16, 20, 24, 28] in { + def V#Index#M4 : RISCVRegWithSubRegs("V"#Index#"M2"), + !cast("V"#!add(Index, 2)#"M2")]>, + DwarfRegAlias("V"#Index)> { + let SubRegIndices = [sub_vrm2_0, sub_vrm2_1]; } +} - foreach Index = [0, 8, 16, 24] in { - def V#Index#M8 : RISCVRegWithSubRegs("V"#Index#"M4"), - !cast("V"#!add(Index, 4)#"M4")], - ["v"#Index]>, - DwarfRegAlias("V"#Index)> { - let SubRegIndices = [sub_vrm4_0, sub_vrm4_1]; - } +foreach Index = [0, 8, 16, 24] in { + def V#Index#M8 : RISCVRegWithSubRegs("V"#Index#"M4"), + !cast("V"#!add(Index, 4)#"M4")]>, + DwarfRegAlias("V"#Index)> { + let SubRegIndices = [sub_vrm4_0, sub_vrm4_1]; } - - def VTYPE : RISCVReg<0, "vtype", ["vtype"]>; - def VL : RISCVReg<0, "vl", ["vl"]>; - def VXSAT : RISCVReg<0, "vxsat", ["vxsat"]>; - def VXRM : RISCVReg<0, "vxrm", ["vxrm"]>; - let isConstant = true in - def VLENB : RISCVReg<0, "vlenb", ["vlenb"]>, - DwarfRegNum<[!add(4096, SysRegVLENB.Encoding)]>; } +def VTYPE : RISCVReg<0, "vtype">; +def VL : RISCVReg<0, "vl">; +def VXSAT : RISCVReg<0, "vxsat">; +def VXRM : RISCVReg<0, "vxrm">; +let isConstant = true in +def VLENB : RISCVReg<0, "vlenb">, + DwarfRegNum<[!add(4096, SysRegVLENB.Encoding)]>; + def VCSR : RegisterClass<"RISCV", [XLenVT], 32, (add VTYPE, VL, VLENB)> { let RegInfos = XLenRI; From 700cd99061edeeba7b657e32acca940225fa25ae Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Wed, 22 Mar 2023 10:01:45 -0700 Subject: [PATCH 018/208] Restore "[MemProf] Context disambiguation cloning pass [patch 1a/3]" This restores commit d6ad4f01c3dafcab335bca66dac6e36d9eac8421, which was reverted in commit 883dbb9c86be87593a58ef10b070b3a0564c7fee, along with a fix for gcc 12.2 build errors in the original commit. Support for building, printing, and displaying CallsiteContextGraph which represents the MemProf metadata contexts. 
Uses CRTP to enable support for both IR (regular LTO) and summary (ThinLTO). This patch includes the support for building it in regular LTO mode (from memprof and callsite metadata), and the next patch will add the handling for building it from ThinLTO summaries. Also includes support for dumping the graph to text and to dot files. Follow-on patches will contain the support for cloning on the graph and in the IR. The graph represents the call contexts in all memprof metadata on allocation calls, with nodes for the allocations themselves, as well as for the calls in each context. The graph is initially built from the allocation memprof metadata (or summary) MIBs. It is then updated to match calls with callsite metadata onto the nodes, updating it to reflect any inlining performed on those calls. Each MIB (representing an allocation's call context with allocation behavior) is assigned a unique context id during the graph build. The edges and nodes in the graph are decorated with the context ids they carry. This is used to correctly update the graph when cloning is performed so that we can uniquify the context for a single (possibly cloned) allocation. Differential Revision: https://reviews.llvm.org/D140908 --- .../IPO/MemProfContextDisambiguation.h | 38 + llvm/lib/Passes/PassBuilder.cpp | 1 + llvm/lib/Passes/PassBuilderPipelines.cpp | 11 + llvm/lib/Passes/PassRegistry.def | 1 + llvm/lib/Transforms/IPO/CMakeLists.txt | 1 + .../IPO/MemProfContextDisambiguation.cpp | 1583 +++++++++++++++++ llvm/test/ThinLTO/X86/memprof-summary.ll | 184 -- .../MemProfContextDisambiguation/basic.ll | 158 ++ .../duplicate-context-ids.ll | 232 +++ .../duplicate-context-ids2.ll | 386 ++++ .../indirectcall.ll | 261 +++ .../MemProfContextDisambiguation/inlined.ll | 189 ++ .../MemProfContextDisambiguation/inlined2.ll | 135 ++ .../pass-pipeline.ll | 41 + 14 files changed, 3037 insertions(+), 184 deletions(-) create mode 100644 llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h create mode 100644 llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp delete mode 100644 llvm/test/ThinLTO/X86/memprof-summary.ll create mode 100644 llvm/test/Transforms/MemProfContextDisambiguation/basic.ll create mode 100644 llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids.ll create mode 100644 llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids2.ll create mode 100644 llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll create mode 100644 llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll create mode 100644 llvm/test/Transforms/MemProfContextDisambiguation/inlined2.ll create mode 100644 llvm/test/Transforms/MemProfContextDisambiguation/pass-pipeline.ll diff --git a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h new file mode 100644 index 0000000000000..56e56ed67f7df --- /dev/null +++ b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h @@ -0,0 +1,38 @@ +//==- MemProfContextDisambiguation.h - Context Disambiguation ----*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implements support for context disambiguation of allocation calls for profile +// guided heap optimization using memprof metadata. 
See implementation file for +// details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_IPO_MEMPROF_CONTEXT_DISAMBIGUATION_H +#define LLVM_TRANSFORMS_IPO_MEMPROF_CONTEXT_DISAMBIGUATION_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { +class Module; + +class MemProfContextDisambiguation + : public PassInfoMixin { + /// Run the context disambiguator on \p M, returns true if any changes made. + bool processModule(Module &M); + +public: + MemProfContextDisambiguation() {} + + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_IPO_MEMPROF_CONTEXT_DISAMBIGUATION_H diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 89d2e6a4b2d1a..a04f8bbaa5dc0 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -117,6 +117,7 @@ #include "llvm/Transforms/IPO/Internalize.h" #include "llvm/Transforms/IPO/LoopExtractor.h" #include "llvm/Transforms/IPO/LowerTypeTests.h" +#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h" #include "llvm/Transforms/IPO/MergeFunctions.h" #include "llvm/Transforms/IPO/ModuleInliner.h" #include "llvm/Transforms/IPO/OpenMPOpt.h" diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 1d386139d9e6c..aaabe23049288 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -57,6 +57,7 @@ #include "llvm/Transforms/IPO/InferFunctionAttrs.h" #include "llvm/Transforms/IPO/Inliner.h" #include "llvm/Transforms/IPO/LowerTypeTests.h" +#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h" #include "llvm/Transforms/IPO/MergeFunctions.h" #include "llvm/Transforms/IPO/ModuleInliner.h" #include "llvm/Transforms/IPO/OpenMPOpt.h" @@ -271,6 +272,10 @@ static cl::opt AttributorRun( clEnumValN(AttributorRunOption::NONE, "none", "disable attributor runs"))); +cl::opt EnableMemProfContextDisambiguation( + "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden, + cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation")); + PipelineTuningOptions::PipelineTuningOptions() { LoopInterleaving = true; LoopVectorization = true; @@ -1709,6 +1714,12 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, InlineContext{ThinOrFullLTOPhase::FullLTOPostLink, InlinePass::CGSCCInliner})); + // Perform context disambiguation after inlining, since that would reduce the + // amount of additional cloning required to distinguish the allocation + // contexts. + if (EnableMemProfContextDisambiguation) + MPM.addPass(MemProfContextDisambiguation()); + // Optimize globals again after we ran the inliner. 
MPM.addPass(GlobalOptPass()); diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 04d648580a8c5..82592a1ee9b55 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -87,6 +87,7 @@ MODULE_PASS("name-anon-globals", NameAnonGlobalPass()) MODULE_PASS("no-op-module", NoOpModulePass()) MODULE_PASS("objc-arc-apelim", ObjCARCAPElimPass()) MODULE_PASS("partial-inliner", PartialInlinerPass()) +MODULE_PASS("memprof-context-disambiguation", MemProfContextDisambiguation()) MODULE_PASS("pgo-icall-prom", PGOIndirectCallPromotion()) MODULE_PASS("pgo-instr-gen", PGOInstrumentationGen()) MODULE_PASS("pgo-instr-use", PGOInstrumentationUse()) diff --git a/llvm/lib/Transforms/IPO/CMakeLists.txt b/llvm/lib/Transforms/IPO/CMakeLists.txt index 063a9a60d0cb5..e03aff0f65d7a 100644 --- a/llvm/lib/Transforms/IPO/CMakeLists.txt +++ b/llvm/lib/Transforms/IPO/CMakeLists.txt @@ -27,6 +27,7 @@ add_llvm_component_library(LLVMipo Internalize.cpp LoopExtractor.cpp LowerTypeTests.cpp + MemProfContextDisambiguation.cpp MergeFunctions.cpp ModuleInliner.cpp OpenMPOpt.cpp diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp new file mode 100644 index 0000000000000..5a6625743eecf --- /dev/null +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -0,0 +1,1583 @@ +//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements support for context disambiguation of allocation +// calls for profile guided heap optimization. Specifically, it uses Memprof +// profiles which indicate context specific allocation behavior (currently +// distinguishing cold vs hot memory allocations). Cloning is performed to +// expose the cold allocation call contexts, and the allocation calls are +// subsequently annotated with an attribute for later transformation. +// +// The transformations can be performed either directly on IR (regular LTO), or +// (eventually) on a ThinLTO index (later applied to the IR during the ThinLTO +// backend). Both types of LTO operate on a the same base graph representation, +// which uses CRTP to support either IR or Index formats. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SetOperations.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/MemoryProfileInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/GraphWriter.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/IPO.h" +#include +#include +using namespace llvm; +using namespace llvm::memprof; + +#define DEBUG_TYPE "memprof-context-disambiguation" + +static cl::opt DotFilePathPrefix( + "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden, + cl::value_desc("filename"), + cl::desc("Specify the path prefix of the MemProf dot files.")); + +static cl::opt ExportToDot("memprof-export-to-dot", cl::init(false), + cl::Hidden, + cl::desc("Export graph to dot files.")); + +static cl::opt + DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden, + cl::desc("Dump CallingContextGraph to stdout after each stage.")); + +static cl::opt + VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden, + cl::desc("Perform verification checks on CallingContextGraph.")); + +static cl::opt + VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden, + cl::desc("Perform frequent verification checks on nodes.")); + +inline bool hasSingleAllocType(uint8_t AllocTypes) { + switch (AllocTypes) { + case (uint8_t)AllocationType::Cold: + case (uint8_t)AllocationType::NotCold: + return true; + break; + case (uint8_t)AllocationType::None: + assert(false); + break; + default: + return false; + break; + } + llvm_unreachable("invalid alloc type"); +} + +/// CRTP base for graphs built from either IR or ThinLTO summary index. +/// +/// The graph represents the call contexts in all memprof metadata on allocation +/// calls, with nodes for the allocations themselves, as well as for the calls +/// in each context. The graph is initially built from the allocation memprof +/// metadata (or summary) MIBs. It is then updated to match calls with callsite +/// metadata onto the nodes, updating it to reflect any inlining performed on +/// those calls. +/// +/// Each MIB (representing an allocation's call context with allocation +/// behavior) is assigned a unique context id during the graph build. The edges +/// and nodes in the graph are decorated with the context ids they carry. This +/// is used to correctly update the graph when cloning is performed so that we +/// can uniquify the context for a single (possibly cloned) allocation. +template +class CallsiteContextGraph { +public: + CallsiteContextGraph() = default; + CallsiteContextGraph(const CallsiteContextGraph &) = default; + CallsiteContextGraph(CallsiteContextGraph &&) = default; + + /// Main entry point to perform analysis and transformations on graph. 
+ bool process(); + + void dump() const; + void print(raw_ostream &OS) const; + + friend raw_ostream &operator<<(raw_ostream &OS, + const CallsiteContextGraph &CCG) { + CCG.print(OS); + return OS; + } + + friend struct GraphTraits< + const CallsiteContextGraph *>; + friend struct DOTGraphTraits< + const CallsiteContextGraph *>; + + void exportToDot(std::string Label) const; + + /// Represents a function clone via FuncTy pointer and clone number pair. + struct FuncInfo final + : public std::pair { + using Base = std::pair; + FuncInfo(const Base &B) : Base(B) {} + FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {} + explicit operator bool() const { return this->first != nullptr; } + FuncTy *func() const { return this->first; } + unsigned cloneNo() const { return this->second; } + }; + + /// Represents a callsite clone via CallTy and clone number pair. + struct CallInfo final : public std::pair { + using Base = std::pair; + CallInfo(const Base &B) : Base(B) {} + CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0) + : Base(Call, CloneNo) {} + explicit operator bool() const { return (bool)this->first; } + CallTy call() const { return this->first; } + unsigned cloneNo() const { return this->second; } + void setCloneNo(unsigned N) { this->second = N; } + void print(raw_ostream &OS) const { + if (!operator bool()) { + assert(!cloneNo()); + OS << "null Call"; + return; + } + call()->print(OS); + OS << "\t(clone " << cloneNo() << ")"; + } + void dump() const { + print(dbgs()); + dbgs() << "\n"; + } + friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) { + Call.print(OS); + return OS; + } + }; + + struct ContextEdge; + + /// Node in the Callsite Context Graph + struct ContextNode { + // Keep this for now since in the IR case where we have an Instruction* it + // is not as immediately discoverable. Used for printing richer information + // when dumping graph. + bool IsAllocation; + + // Keeps track of when the Call was reset to null because there was + // recursion. + bool Recursive = false; + + // The corresponding allocation or interior call. + CallInfo Call; + + // For alloc nodes this is a unique id assigned when constructed, and for + // callsite stack nodes it is the original stack id when the node is + // constructed from the memprof MIB metadata on the alloc nodes. Note that + // this is only used when matching callsite metadata onto the stack nodes + // created when processing the allocation memprof MIBs, and for labeling + // nodes in the dot graph. Therefore we don't bother to assign a value for + // clones. + uint64_t OrigStackOrAllocId = 0; + + // This will be formed by ORing together the AllocationType enum values + // for contexts including this node. + uint8_t AllocTypes = 0; + + // Edges to all callees in the profiled call stacks. + // TODO: Should this be a map (from Callee node) for more efficient lookup? + std::vector> CalleeEdges; + + // Edges to all callers in the profiled call stacks. + // TODO: Should this be a map (from Caller node) for more efficient lookup? + std::vector> CallerEdges; + + // The set of IDs for contexts including this node. + DenseSet ContextIds; + + // List of clones of this ContextNode, initially empty. + std::vector Clones; + + // If a clone, points to the original uncloned node. 
+ ContextNode *CloneOf = nullptr; + + ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {} + + ContextNode(bool IsAllocation, CallInfo C) + : IsAllocation(IsAllocation), Call(C) {} + + std::unique_ptr clone() { + auto Clone = std::make_unique(IsAllocation, Call); + if (CloneOf) { + CloneOf->Clones.push_back(Clone.get()); + Clone->CloneOf = CloneOf; + } else { + Clones.push_back(Clone.get()); + Clone->CloneOf = this; + } + return Clone; + } + + ContextNode *getOrigNode() { + if (!CloneOf) + return this; + return CloneOf; + } + + void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType, + unsigned int ContextId); + + ContextEdge *findEdgeFromCallee(const ContextNode *Callee); + ContextEdge *findEdgeFromCaller(const ContextNode *Caller); + void eraseCalleeEdge(const ContextEdge *Edge); + void eraseCallerEdge(const ContextEdge *Edge); + + void setCall(CallInfo C) { Call = C; } + + bool hasCall() const { return (bool)Call.call(); } + + void printCall(raw_ostream &OS) const { Call.print(OS); } + + // True if this node was effectively removed from the graph, in which case + // its context id set, caller edges, and callee edges should all be empty. + bool isRemoved() const { + assert(ContextIds.empty() == + (CalleeEdges.empty() && CallerEdges.empty())); + return ContextIds.empty(); + } + + void dump() const; + void print(raw_ostream &OS) const; + + friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) { + Node.print(OS); + return OS; + } + }; + + /// Edge in the Callsite Context Graph from a ContextNode N to a caller or + /// callee. + struct ContextEdge { + ContextNode *Callee; + ContextNode *Caller; + + // This will be formed by ORing together the AllocationType enum values + // for contexts including this edge. + uint8_t AllocTypes = 0; + + // The set of IDs for contexts including this edge. + DenseSet ContextIds; + + ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType, + DenseSet ContextIds) + : Callee(Callee), Caller(Caller), AllocTypes(AllocType), + ContextIds(ContextIds) {} + + DenseSet &getContextIds() { return ContextIds; } + + void dump() const; + void print(raw_ostream &OS) const; + + friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) { + Edge.print(OS); + return OS; + } + }; + +protected: + /// Get a list of nodes corresponding to the stack ids in the given callsite + /// context. + template + std::vector + getStackIdsWithContextNodes(CallStack &CallsiteContext); + + /// Adds nodes for the given allocation and any stack ids on its memprof MIB + /// metadata (or summary). + ContextNode *addAllocNode(CallInfo Call, const FuncTy *F); + + /// Adds nodes for the given MIB stack ids. + template + void addStackNodesForMIB(ContextNode *AllocNode, + CallStack &StackContext, + CallStack &CallsiteContext, + AllocationType AllocType); + + /// Matches all callsite metadata (or summary) to the nodes created for + /// allocation memprof MIB metadata, synthesizing new nodes to reflect any + /// inlining performed on those callsite instructions. + void updateStackNodes(); + + /// Update graph to conservatively handle any callsite stack nodes that target + /// multiple different callee target functions. + void handleCallsitesWithMultipleTargets(); + + /// Save lists of calls with MemProf metadata in each function, for faster + /// iteration. + std::vector>> + FuncToCallsWithMetadata; + + /// Map from callsite node to the enclosing caller function. 
+ std::map NodeToCallingFunc; + +private: + using EdgeIter = typename std::vector>::iterator; + + using CallContextInfo = std::tuple, + const FuncTy *, DenseSet>; + + /// Assigns the given Node to calls at or inlined into the location with + /// the Node's stack id, after post order traversing and processing its + /// caller nodes. Uses the call information recorded in the given + /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences + /// as needed. Called by updateStackNodes which sets up the given + /// StackIdToMatchingCalls map. + void assignStackNodesPostOrder( + ContextNode *Node, DenseSet &Visited, + DenseMap> &StackIdToMatchingCalls); + + /// Duplicates the given set of context ids, updating the provided + /// map from each original id with the newly generated context ids, + /// and returning the new duplicated id set. + DenseSet duplicateContextIds( + const DenseSet &StackSequenceContextIds, + DenseMap> &OldToNewContextIds); + + /// Propagates all duplicated context ids across the graph. + void propagateDuplicateContextIds( + const DenseMap> &OldToNewContextIds); + + /// Connect the NewNode to OrigNode's callees if TowardsCallee is true, + /// else to its callers. Also updates OrigNode's edges to remove any context + /// ids moved to the newly created edge. + void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode, + bool TowardsCallee); + + /// Get the stack id corresponding to the given Id or Index (for IR this will + /// return itself, for a summary index this will return the id recorded in the + /// index for that stack id index value). + uint64_t getStackId(uint64_t IdOrIndex) const { + return static_cast(this)->getStackId(IdOrIndex); + } + + /// Returns true if the given call targets the given function. + bool calleeMatchesFunc(CallTy Call, const FuncTy *Func) { + return static_cast(this)->calleeMatchesFunc(Call, Func); + } + + /// Get a list of nodes corresponding to the stack ids in the given + /// callsite's context. + std::vector getStackIdsWithContextNodesForCall(CallTy Call) { + return static_cast(this)->getStackIdsWithContextNodesForCall( + Call); + } + + /// Get the last stack id in the context for callsite. + uint64_t getLastStackId(CallTy Call) { + return static_cast(this)->getLastStackId(Call); + } + + /// Gets a label to use in the dot graph for the given call clone in the given + /// function. + std::string getLabel(const FuncTy *Func, const CallTy Call, + unsigned CloneNo) const { + return static_cast(this)->getLabel(Func, Call, CloneNo); + } + + /// Helpers to find the node corresponding to the given call or stackid. + ContextNode *getNodeForInst(const CallInfo &C); + ContextNode *getNodeForAlloc(const CallInfo &C); + ContextNode *getNodeForStackId(uint64_t StackId); + + /// Removes the node information recorded for the given call. + void unsetNodeForInst(const CallInfo &C); + + /// Computes the alloc type corresponding to the given context ids, by + /// unioning their recorded alloc types. + uint8_t computeAllocType(DenseSet &ContextIds); + + /// Map from each context ID to the AllocationType assigned to that context. + std::map ContextIdToAllocationType; + + /// Identifies the context node created for a stack id when adding the MIB + /// contexts to the graph. This is used to locate the context nodes when + /// trying to assign the corresponding callsites with those stack ids to these + /// nodes. + std::map StackEntryIdToContextNodeMap; + + /// Maps to track the calls to their corresponding nodes in the graph. 
+ std::map AllocationCallToContextNodeMap; + std::map NonAllocationCallToContextNodeMap; + + /// Owner of all ContextNode unique_ptrs. + std::vector> NodeOwner; + + /// Perform sanity checks on graph when requested. + void check() const; + + /// Keeps track of the last unique context id assigned. + unsigned int LastContextId = 0; +}; + +template +using ContextNode = + typename CallsiteContextGraph::ContextNode; +template +using ContextEdge = + typename CallsiteContextGraph::ContextEdge; +template +using FuncInfo = + typename CallsiteContextGraph::FuncInfo; +template +using CallInfo = + typename CallsiteContextGraph::CallInfo; + +/// CRTP derived class for graphs built from IR (regular LTO). +class ModuleCallsiteContextGraph + : public CallsiteContextGraph { +public: + ModuleCallsiteContextGraph(Module &M); + +private: + friend CallsiteContextGraph; + + uint64_t getStackId(uint64_t IdOrIndex) const; + bool calleeMatchesFunc(Instruction *Call, const Function *Func); + uint64_t getLastStackId(Instruction *Call); + std::vector getStackIdsWithContextNodesForCall(Instruction *Call); + std::string getLabel(const Function *Func, const Instruction *Call, + unsigned CloneNo) const; + + const Module &Mod; +}; + +namespace { + +struct FieldSeparator { + bool Skip = true; + const char *Sep; + + FieldSeparator(const char *Sep = ", ") : Sep(Sep) {} +}; + +raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) { + if (FS.Skip) { + FS.Skip = false; + return OS; + } + return OS << FS.Sep; +} + +} // end anonymous namespace + +template +typename CallsiteContextGraph::ContextNode * +CallsiteContextGraph::getNodeForInst( + const CallInfo &C) { + ContextNode *Node = getNodeForAlloc(C); + if (Node) + return Node; + + auto NonAllocCallNode = NonAllocationCallToContextNodeMap.find(C); + if (NonAllocCallNode != NonAllocationCallToContextNodeMap.end()) { + return NonAllocCallNode->second; + } + return nullptr; +} + +template +typename CallsiteContextGraph::ContextNode * +CallsiteContextGraph::getNodeForAlloc( + const CallInfo &C) { + auto AllocCallNode = AllocationCallToContextNodeMap.find(C); + if (AllocCallNode != AllocationCallToContextNodeMap.end()) { + return AllocCallNode->second; + } + return nullptr; +} + +template +typename CallsiteContextGraph::ContextNode * +CallsiteContextGraph::getNodeForStackId( + uint64_t StackId) { + auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId); + if (StackEntryNode != StackEntryIdToContextNodeMap.end()) + return StackEntryNode->second; + return nullptr; +} + +template +void CallsiteContextGraph::unsetNodeForInst( + const CallInfo &C) { + AllocationCallToContextNodeMap.erase(C) || + NonAllocationCallToContextNodeMap.erase(C); + assert(!AllocationCallToContextNodeMap.count(C) && + !NonAllocationCallToContextNodeMap.count(C)); +} + +template +void CallsiteContextGraph::ContextNode:: + addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType, + unsigned int ContextId) { + for (auto &Edge : CallerEdges) { + if (Edge->Caller == Caller) { + Edge->AllocTypes |= (uint8_t)AllocType; + Edge->getContextIds().insert(ContextId); + return; + } + } + std::shared_ptr Edge = std::make_shared( + this, Caller, (uint8_t)AllocType, DenseSet({ContextId})); + CallerEdges.push_back(Edge); + Caller->CalleeEdges.push_back(Edge); +} + +template +typename CallsiteContextGraph::ContextEdge * +CallsiteContextGraph::ContextNode:: + findEdgeFromCallee(const ContextNode *Callee) { + for (const auto &Edge : CalleeEdges) + if (Edge->Callee == Callee) + return Edge.get(); + return 
nullptr; +} + +template +typename CallsiteContextGraph::ContextEdge * +CallsiteContextGraph::ContextNode:: + findEdgeFromCaller(const ContextNode *Caller) { + for (const auto &Edge : CallerEdges) + if (Edge->Caller == Caller) + return Edge.get(); + return nullptr; +} + +template +void CallsiteContextGraph::ContextNode:: + eraseCalleeEdge(const ContextEdge *Edge) { + auto EI = + std::find_if(CalleeEdges.begin(), CalleeEdges.end(), + [Edge](const std::shared_ptr &CalleeEdge) { + return CalleeEdge.get() == Edge; + }); + assert(EI != CalleeEdges.end()); + CalleeEdges.erase(EI); +} + +template +void CallsiteContextGraph::ContextNode:: + eraseCallerEdge(const ContextEdge *Edge) { + auto EI = + std::find_if(CallerEdges.begin(), CallerEdges.end(), + [Edge](const std::shared_ptr &CallerEdge) { + return CallerEdge.get() == Edge; + }); + assert(EI != CallerEdges.end()); + CallerEdges.erase(EI); +} + +template +uint8_t CallsiteContextGraph::computeAllocType( + DenseSet &ContextIds) { + uint8_t BothTypes = + (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold; + uint8_t AllocType = (uint8_t)AllocationType::None; + for (auto Id : ContextIds) { + AllocType |= (uint8_t)ContextIdToAllocationType[Id]; + // Bail early if alloc type reached both, no further refinement. + if (AllocType == BothTypes) + return AllocType; + } + return AllocType; +} + +template +typename CallsiteContextGraph::ContextNode * +CallsiteContextGraph::addAllocNode( + CallInfo Call, const FuncTy *F) { + assert(!getNodeForAlloc(Call)); + NodeOwner.push_back( + std::make_unique(/*IsAllocation=*/true, Call)); + ContextNode *AllocNode = NodeOwner.back().get(); + AllocationCallToContextNodeMap[Call] = AllocNode; + NodeToCallingFunc[AllocNode] = F; + // Use LastContextId as a uniq id for MIB allocation nodes. + AllocNode->OrigStackOrAllocId = LastContextId; + // Alloc type should be updated as we add in the MIBs. We should assert + // afterwards that it is not still None. + AllocNode->AllocTypes = (uint8_t)AllocationType::None; + + return AllocNode; +} + +template +template +void CallsiteContextGraph::addStackNodesForMIB( + ContextNode *AllocNode, CallStack &StackContext, + CallStack &CallsiteContext, AllocationType AllocType) { + ContextIdToAllocationType[++LastContextId] = AllocType; + + // Update alloc type and context ids for this MIB. + AllocNode->AllocTypes |= (uint8_t)AllocType; + AllocNode->ContextIds.insert(LastContextId); + + // Now add or update nodes for each stack id in alloc's context. + // Later when processing the stack ids on non-alloc callsites we will adjust + // for any inlining in the context. + ContextNode *PrevNode = AllocNode; + // Look for recursion (direct recursion should have been collapsed by + // module summary analysis, here we should just be detecting mutual + // recursion). Mark these nodes so we don't try to clone. + SmallSet StackIdSet; + // Skip any on the allocation call (inlining). 
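The shared-prefix skip mentioned in the comment above can be illustrated with plain containers. This is a standalone sketch under the assumption that the leading frames of an MIB stack duplicate the inlined frames already listed in the allocation call's own callsite metadata; it is not the CallStack API used in the code below.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // MIB stack for one context, ordered from the allocation call outward.
  std::vector<uint64_t> MIBStack = {100, 200, 300, 400};
  // Frames inlined into the allocating function, recorded on the alloc call.
  std::vector<uint64_t> AllocCallsite = {100, 200};
  // Skip the shared prefix; only the remaining frames get stack nodes.
  auto FirstDiff = std::mismatch(AllocCallsite.begin(), AllocCallsite.end(),
                                 MIBStack.begin(), MIBStack.end());
  for (auto It = FirstDiff.second; It != MIBStack.end(); ++It)
    std::cout << "add/update stack node for id " << *It << "\n";  // 300, 400
  return 0;
}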
+ for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext); + ContextIter != StackContext.end(); ++ContextIter) { + auto StackId = getStackId(*ContextIter); + ContextNode *StackNode = getNodeForStackId(StackId); + if (!StackNode) { + NodeOwner.push_back( + std::make_unique(/*IsAllocation=*/false)); + StackNode = NodeOwner.back().get(); + StackEntryIdToContextNodeMap[StackId] = StackNode; + StackNode->OrigStackOrAllocId = StackId; + } + auto Ins = StackIdSet.insert(StackId); + if (!Ins.second) + StackNode->Recursive = true; + StackNode->ContextIds.insert(LastContextId); + StackNode->AllocTypes |= (uint8_t)AllocType; + PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId); + PrevNode = StackNode; + } +} + +template +DenseSet +CallsiteContextGraph::duplicateContextIds( + const DenseSet &StackSequenceContextIds, + DenseMap> &OldToNewContextIds) { + DenseSet NewContextIds; + for (auto OldId : StackSequenceContextIds) { + NewContextIds.insert(++LastContextId); + OldToNewContextIds[OldId].insert(LastContextId); + assert(ContextIdToAllocationType.count(OldId)); + // The new context has the same allocation type as original. + ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId]; + } + return NewContextIds; +} + +template +void CallsiteContextGraph:: + propagateDuplicateContextIds( + const DenseMap> &OldToNewContextIds) { + // Build a set of duplicated context ids corresponding to the input id set. + auto GetNewIds = [&OldToNewContextIds](const DenseSet &ContextIds) { + DenseSet NewIds; + for (auto Id : ContextIds) + if (auto NewId = OldToNewContextIds.find(Id); + NewId != OldToNewContextIds.end()) + NewIds.insert(NewId->second.begin(), NewId->second.end()); + return NewIds; + }; + + // Recursively update context ids sets along caller edges. + auto UpdateCallers = [&](ContextNode *Node, + DenseSet &Visited, + auto &&UpdateCallers) -> void { + for (auto Edge : Node->CallerEdges) { + auto Inserted = Visited.insert(Edge.get()); + if (!Inserted.second) + continue; + ContextNode *NextNode = Edge->Caller; + DenseSet NewIdsToAdd = GetNewIds(Edge->getContextIds()); + // Only need to recursively iterate to NextNode via this caller edge if + // it resulted in any added ids to NextNode. + if (!NewIdsToAdd.empty()) { + Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end()); + NextNode->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end()); + UpdateCallers(NextNode, Visited, UpdateCallers); + } + } + }; + + DenseSet Visited; + for (auto &Entry : AllocationCallToContextNodeMap) { + auto *Node = Entry.second; + // Update ids on the allocation nodes before calling the recursive + // update along caller edges, since this simplifies the logic during + // that traversal. + DenseSet NewIdsToAdd = GetNewIds(Node->ContextIds); + Node->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end()); + UpdateCallers(Node, Visited, UpdateCallers); + } +} + +template +void CallsiteContextGraph::connectNewNode( + ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee) { + // Make a copy of the context ids, since this will be adjusted below as they + // are moved. + DenseSet RemainingContextIds = NewNode->ContextIds; + auto &OrigEdges = + TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges; + // Increment iterator in loop so that we can remove edges as needed. 
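A minimal standalone sketch of the id bookkeeping described above, using std::set and hypothetical values instead of the pass's DenseSet and set helpers: the ids that belong to the new node are peeled off each original edge, and an edge left with no ids is dropped.

#include <cstdint>
#include <iostream>
#include <set>

int main() {
  std::set<uint32_t> EdgeIds = {1, 2, 3};  // ids currently on an OrigNode edge
  std::set<uint32_t> NewNodeIds = {2, 3};  // ids assigned to the new node
  std::set<uint32_t> MovedIds, KeptIds;
  for (uint32_t Id : EdgeIds)
    (NewNodeIds.count(Id) ? MovedIds : KeptIds).insert(Id);
  EdgeIds = KeptIds;  // the new edge gets MovedIds = {2, 3}
  std::cout << "new edge ids: " << MovedIds.size()
            << ", old edge ids: " << EdgeIds.size()
            << (EdgeIds.empty() ? " (erase the old edge)" : "") << "\n";
  return 0;
}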
+ for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) { + auto Edge = *EI; + // Remove any matching context ids from Edge, return set that were found and + // removed, these are the new edge's context ids. Also update the remaining + // (not found ids). + DenseSet NewEdgeContextIds, NotFoundContextIds; + set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds, + NotFoundContextIds); + RemainingContextIds.swap(NotFoundContextIds); + // If no matching context ids for this edge, skip it. + if (NewEdgeContextIds.empty()) { + ++EI; + continue; + } + if (TowardsCallee) { + auto NewEdge = std::make_shared( + Edge->Callee, NewNode, computeAllocType(NewEdgeContextIds), + NewEdgeContextIds); + NewNode->CalleeEdges.push_back(NewEdge); + NewEdge->Callee->CallerEdges.push_back(NewEdge); + } else { + auto NewEdge = std::make_shared( + NewNode, Edge->Caller, computeAllocType(NewEdgeContextIds), + NewEdgeContextIds); + NewNode->CallerEdges.push_back(NewEdge); + NewEdge->Caller->CalleeEdges.push_back(NewEdge); + } + // Remove old edge if context ids empty. + if (Edge->getContextIds().empty()) { + if (TowardsCallee) { + Edge->Callee->eraseCallerEdge(Edge.get()); + EI = OrigNode->CalleeEdges.erase(EI); + } else { + Edge->Caller->eraseCalleeEdge(Edge.get()); + EI = OrigNode->CallerEdges.erase(EI); + } + continue; + } + ++EI; + } +} + +template +void CallsiteContextGraph:: + assignStackNodesPostOrder(ContextNode *Node, + DenseSet &Visited, + DenseMap> + &StackIdToMatchingCalls) { + auto Inserted = Visited.insert(Node); + if (!Inserted.second) + return; + // Post order traversal. Iterate over a copy since we may add nodes and + // therefore new callers during the recursive call, invalidating any + // iterator over the original edge vector. We don't need to process these + // new nodes as they were already processed on creation. + auto CallerEdges = Node->CallerEdges; + for (auto &Edge : CallerEdges) { + // Skip any that have been removed during the recursion. + if (!Edge) + continue; + assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls); + } + + // If this node's stack id is in the map, update the graph to contain new + // nodes representing any inlining at interior callsites. Note we move the + // associated context ids over to the new nodes. + + // Ignore this node if it is for an allocation or we didn't record any + // stack id lists ending at it. + if (Node->IsAllocation || + !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId)) + return; + + auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId]; + // Handle the simple case first. A single call with a single stack id. + // In this case there is no need to create any new context nodes, simply + // assign the context node for stack id to this Call. + if (Calls.size() == 1) { + auto &[Call, Ids, Func, SavedContextIds] = Calls[0]; + if (Ids.size() == 1) { + assert(SavedContextIds.empty()); + // It should be this Node + assert(Node == getNodeForStackId(Ids[0])); + if (Node->Recursive) + return; + Node->setCall(Call); + NonAllocationCallToContextNodeMap[Call] = Node; + NodeToCallingFunc[Node] = Func; + return; + } + } + + // Find the node for the last stack id, which should be the same + // across all calls recorded for this id, and is this node's id. + uint64_t LastId = Node->OrigStackOrAllocId; + ContextNode *LastNode = getNodeForStackId(LastId); + // We should only have kept stack ids that had nodes. 
+ assert(LastNode); + + for (unsigned I = 0; I < Calls.size(); I++) { + auto &[Call, Ids, Func, SavedContextIds] = Calls[I]; + // Skip any for which we didn't assign any ids, these don't get a node in + // the graph. + if (SavedContextIds.empty()) + continue; + + assert(LastId == Ids.back()); + + ContextNode *FirstNode = getNodeForStackId(Ids[0]); + assert(FirstNode); + + // Recompute the context ids for this stack id sequence (the + // intersection of the context ids of the corresponding nodes). + // Start with the ids we saved in the map for this call, which could be + // duplicated context ids. We have to recompute as we might have overlap + // overlap between the saved context ids for different last nodes, and + // removed them already during the post order traversal. + set_intersect(SavedContextIds, FirstNode->ContextIds); + ContextNode *PrevNode = nullptr; + for (auto Id : Ids) { + ContextNode *CurNode = getNodeForStackId(Id); + // We should only have kept stack ids that had nodes and weren't + // recursive. + assert(CurNode); + assert(!CurNode->Recursive); + if (!PrevNode) { + PrevNode = CurNode; + continue; + } + auto *Edge = CurNode->findEdgeFromCallee(PrevNode); + if (!Edge) { + SavedContextIds.clear(); + break; + } + PrevNode = CurNode; + set_intersect(SavedContextIds, Edge->getContextIds()); + + // If we now have no context ids for clone, skip this call. + if (SavedContextIds.empty()) + break; + } + if (SavedContextIds.empty()) + continue; + + // Create new context node. + NodeOwner.push_back( + std::make_unique(/*IsAllocation=*/false, Call)); + ContextNode *NewNode = NodeOwner.back().get(); + NodeToCallingFunc[NewNode] = Func; + NonAllocationCallToContextNodeMap[Call] = NewNode; + NewNode->ContextIds = SavedContextIds; + NewNode->AllocTypes = computeAllocType(NewNode->ContextIds); + + // Connect to callees of innermost stack frame in inlined call chain. + // This updates context ids for FirstNode's callee's to reflect those + // moved to NewNode. + connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true); + + // Connect to callers of outermost stack frame in inlined call chain. + // This updates context ids for FirstNode's caller's to reflect those + // moved to NewNode. + connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false); + + // Now we need to remove context ids from edges/nodes between First and + // Last Node. + PrevNode = nullptr; + for (auto Id : Ids) { + ContextNode *CurNode = getNodeForStackId(Id); + // We should only have kept stack ids that had nodes. + assert(CurNode); + + // Remove the context ids moved to NewNode from CurNode, and the + // edge from the prior node. + set_subtract(CurNode->ContextIds, NewNode->ContextIds); + if (PrevNode) { + auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode); + assert(PrevEdge); + set_subtract(PrevEdge->getContextIds(), NewNode->ContextIds); + if (PrevEdge->getContextIds().empty()) { + PrevNode->eraseCallerEdge(PrevEdge); + CurNode->eraseCalleeEdge(PrevEdge); + } + } + PrevNode = CurNode; + } + } +} + +template +void CallsiteContextGraph::updateStackNodes() { + // Map of stack id to all calls with that as the last (outermost caller) + // callsite id that has a context node (some might not due to pruning + // performed during matching of the allocation profile contexts). + // The CallContextInfo contains the Call and a list of its stack ids with + // ContextNodes, the function containing Call, and the set of context ids + // the analysis will eventually identify for use in any new node created + // for that callsite. 
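The matching that this map supports can be illustrated with a standalone sketch (hypothetical ids, plain std containers instead of DenseSet): a call whose callsite metadata lists several stack ids because of inlining is assigned the intersection of the context ids recorded on the matching chain of stack nodes, since only those contexts actually flow through every frame of the inlined sequence.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <iterator>
#include <map>
#include <set>
#include <vector>

int main() {
  // Stack id -> context ids recorded on that stack node while adding MIBs.
  std::map<uint64_t, std::set<uint32_t>> NodeContextIds = {
      {200, {1, 2, 3}},   // inner (callee-side) frame
      {300, {2, 3, 4}}};  // outer frame, the last id for this callsite
  // An inlined call whose callsite metadata covers both frames.
  std::vector<uint64_t> CallsiteStackIds = {200, 300};
  std::set<uint32_t> Result = NodeContextIds[CallsiteStackIds.front()];
  for (uint64_t StackId : CallsiteStackIds) {
    std::set<uint32_t> Tmp;
    std::set_intersection(Result.begin(), Result.end(),
                          NodeContextIds[StackId].begin(),
                          NodeContextIds[StackId].end(),
                          std::inserter(Tmp, Tmp.begin()));
    Result = std::move(Tmp);
  }
  for (uint32_t Ctx : Result)
    std::cout << "context " << Ctx << "\n";  // prints 2 and 3
  return 0;
}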
+ DenseMap> StackIdToMatchingCalls; + for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) { + for (auto &Call : CallsWithMetadata) { + // Ignore allocations, already handled. + if (AllocationCallToContextNodeMap.count(Call)) + continue; + auto StackIdsWithContextNodes = + getStackIdsWithContextNodesForCall(Call.call()); + // If there were no nodes created for MIBs on allocs (maybe this was in + // the unambiguous part of the MIB stack that was pruned), ignore. + if (StackIdsWithContextNodes.empty()) + continue; + // Otherwise, record this Call along with the list of ids for the last + // (outermost caller) stack id with a node. + StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back( + {Call.call(), StackIdsWithContextNodes, Func, {}}); + } + } + + // First make a pass through all stack ids that correspond to a call, + // as identified in the above loop. Compute the context ids corresponding to + // each of these calls when they correspond to multiple stack ids due to + // due to inlining. Perform any duplication of context ids required when + // there is more than one call with the same stack ids. Their (possibly newly + // duplicated) context ids are saved in the StackIdToMatchingCalls map. + DenseMap> OldToNewContextIds; + for (auto &It : StackIdToMatchingCalls) { + auto &Calls = It.getSecond(); + // Skip single calls with a single stack id. These don't need a new node. + if (Calls.size() == 1) { + auto &Ids = std::get<1>(Calls[0]); + if (Ids.size() == 1) + continue; + } + // In order to do the best and maximal matching of inlined calls to context + // node sequences we will sort the vectors of stack ids in descending order + // of length, and within each length, lexicographically by stack id. The + // latter is so that we can specially handle calls that have identical stack + // id sequences (either due to cloning or artificially because of the MIB + // context pruning). + std::sort(Calls.begin(), Calls.end(), + [](const CallContextInfo &A, const CallContextInfo &B) { + auto &IdsA = std::get<1>(A); + auto &IdsB = std::get<1>(B); + return IdsA.size() > IdsB.size() || + (IdsA.size() == IdsB.size() && IdsA < IdsB); + }); + + // Find the node for the last stack id, which should be the same + // across all calls recorded for this id, and is the id for this + // entry in the StackIdToMatchingCalls map. + uint64_t LastId = It.getFirst(); + ContextNode *LastNode = getNodeForStackId(LastId); + // We should only have kept stack ids that had nodes. + assert(LastNode); + + if (LastNode->Recursive) + continue; + + // Initialize the context ids with the last node's. We will subsequently + // refine the context ids by computing the intersection along all edges. + DenseSet LastNodeContextIds = LastNode->ContextIds; + assert(!LastNodeContextIds.empty()); + + for (unsigned I = 0; I < Calls.size(); I++) { + auto &[Call, Ids, Func, SavedContextIds] = Calls[I]; + assert(SavedContextIds.empty()); + assert(LastId == Ids.back()); + + // First compute the context ids for this stack id sequence (the + // intersection of the context ids of the corresponding nodes). + // Start with the remaining saved ids for the last node. + assert(!LastNodeContextIds.empty()); + DenseSet StackSequenceContextIds = LastNodeContextIds; + + ContextNode *PrevNode = LastNode; + ContextNode *CurNode = LastNode; + bool Skip = false; + + // Iterate backwards through the stack Ids, starting after the last Id + // in the list, which was handled once outside for all Calls. 
+ for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) { + auto Id = *IdIter; + CurNode = getNodeForStackId(Id); + // We should only have kept stack ids that had nodes. + assert(CurNode); + + if (CurNode->Recursive) { + Skip = true; + break; + } + + auto *Edge = CurNode->findEdgeFromCaller(PrevNode); + // If there is no edge then the nodes belong to different MIB contexts, + // and we should skip this inlined context sequence. For example, this + // particular inlined context may include stack ids A->B, and we may + // indeed have nodes for both A and B, but it is possible that they were + // never profiled in sequence in a single MIB for any allocation (i.e. + // we might have profiled an allocation that involves the callsite A, + // but through a different one of its callee callsites, and we might + // have profiled an allocation that involves callsite B, but reached + // from a different caller callsite). + if (!Edge) { + Skip = true; + break; + } + PrevNode = CurNode; + + // Update the context ids, which is the intersection of the ids along + // all edges in the sequence. + set_intersect(StackSequenceContextIds, Edge->getContextIds()); + + // If we now have no context ids for clone, skip this call. + if (StackSequenceContextIds.empty()) { + Skip = true; + break; + } + } + if (Skip) + continue; + + // If some of this call's stack ids did not have corresponding nodes (due + // to pruning), don't include any context ids for contexts that extend + // beyond these nodes. Otherwise we would be matching part of unrelated / + // not fully matching stack contexts. To do this, subtract any context ids + // found in caller nodes of the last node found above. + if (Ids.back() != getLastStackId(Call)) { + for (auto PE : LastNode->CallerEdges) { + set_subtract(StackSequenceContextIds, PE->getContextIds()); + if (StackSequenceContextIds.empty()) + break; + } + // If we now have no context ids for clone, skip this call. + if (StackSequenceContextIds.empty()) + continue; + } + + // Check if the next set of stack ids is the same (since the Calls vector + // of tuples is sorted by the stack ids we can just look at the next one). + bool DuplicateContextIds = false; + if (I + 1 < Calls.size()) { + auto NextIds = std::get<1>(Calls[I + 1]); + DuplicateContextIds = Ids == NextIds; + } + + // If we don't have duplicate context ids, then we can assign all the + // context ids computed for the original node sequence to this call. + // If there are duplicate calls with the same stack ids then we synthesize + // new context ids that are duplicates of the originals. These are + // assigned to SavedContextIds, which is a reference into the map entry + // for this call, allowing us to access these ids later on. + OldToNewContextIds.reserve(OldToNewContextIds.size() + + StackSequenceContextIds.size()); + SavedContextIds = + DuplicateContextIds + ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds) + : StackSequenceContextIds; + assert(!SavedContextIds.empty()); + + if (!DuplicateContextIds) { + // Update saved last node's context ids to remove those that are + // assigned to other calls, so that it is ready for the next call at + // this stack id. + set_subtract(LastNodeContextIds, StackSequenceContextIds); + if (LastNodeContextIds.empty()) + break; + } + } + } + + // Propagate the duplicate context ids over the graph. 
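The duplication handled next can be shown with a standalone sketch (hypothetical ids, plain std containers): when a second call has an identical stack id sequence, it receives fresh ids copied from the originals, and a node or edge id set that contains an original id must also pick up the corresponding duplicates.

#include <cstdint>
#include <iostream>
#include <map>
#include <set>

int main() {
  // Original context id -> ids duplicated from it for a second identical call.
  std::map<uint32_t, std::set<uint32_t>> OldToNew = {{2, {7}}, {3, {8}}};
  // Id set on some node (or edge) reached by contexts 2 and 3.
  std::set<uint32_t> Ids = {1, 2, 3};
  std::set<uint32_t> ToAdd;
  for (uint32_t Id : Ids)
    if (auto It = OldToNew.find(Id); It != OldToNew.end())
      ToAdd.insert(It->second.begin(), It->second.end());
  Ids.insert(ToAdd.begin(), ToAdd.end());
  std::cout << Ids.size() << " ids after propagation\n";  // 5: {1, 2, 3, 7, 8}
  return 0;
}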
+ propagateDuplicateContextIds(OldToNewContextIds); + + if (VerifyCCG) + check(); + + // Now perform a post-order traversal over the graph, starting with the + // allocation nodes, essentially processing nodes from callers to callees. + // For any that contains an id in the map, update the graph to contain new + // nodes representing any inlining at interior callsites. Note we move the + // associated context ids over to the new nodes. + DenseSet Visited; + for (auto &Entry : AllocationCallToContextNodeMap) + assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls); +} + +uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) { + CallStack CallsiteContext( + Call->getMetadata(LLVMContext::MD_callsite)); + return CallsiteContext.back(); +} + +std::string ModuleCallsiteContextGraph::getLabel(const Function *Func, + const Instruction *Call, + unsigned CloneNo) const { + return (Twine(Call->getFunction()->getName()) + " -> " + + cast(Call)->getCalledFunction()->getName()) + .str(); +} + +std::vector +ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall( + Instruction *Call) { + CallStack CallsiteContext( + Call->getMetadata(LLVMContext::MD_callsite)); + return getStackIdsWithContextNodes( + CallsiteContext); +} + +template +template +std::vector +CallsiteContextGraph::getStackIdsWithContextNodes( + CallStack &CallsiteContext) { + std::vector StackIds; + for (auto IdOrIndex : CallsiteContext) { + auto StackId = getStackId(IdOrIndex); + ContextNode *Node = getNodeForStackId(StackId); + if (!Node) + break; + StackIds.push_back(StackId); + } + return StackIds; +} + +ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(Module &M) : Mod(M) { + for (auto &F : M) { + std::vector CallsWithMetadata; + for (auto &BB : F) { + for (auto &I : BB) { + if (!isa(I)) + continue; + if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) { + CallsWithMetadata.push_back(&I); + auto *AllocNode = addAllocNode(&I, &F); + auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite); + assert(CallsiteMD); + CallStack CallsiteContext(CallsiteMD); + // Add all of the MIBs and their stack nodes. + for (auto &MDOp : MemProfMD->operands()) { + auto *MIBMD = cast(MDOp); + MDNode *StackNode = getMIBStackNode(MIBMD); + assert(StackNode); + CallStack StackContext(StackNode); + addStackNodesForMIB( + AllocNode, StackContext, CallsiteContext, + getMIBAllocType(MIBMD)); + } + assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None); + // Memprof and callsite metadata on memory allocations no longer + // needed. + I.setMetadata(LLVMContext::MD_memprof, nullptr); + I.setMetadata(LLVMContext::MD_callsite, nullptr); + } + // For callsite metadata, add to list for this function for later use. + else if (I.getMetadata(LLVMContext::MD_callsite)) + CallsWithMetadata.push_back(&I); + } + } + if (!CallsWithMetadata.empty()) + FuncToCallsWithMetadata.push_back({&F, CallsWithMetadata}); + } + + if (DumpCCG) { + dbgs() << "CCG before updating call stack chains:\n"; + dbgs() << *this; + } + + if (ExportToDot) + exportToDot("prestackupdate"); + + updateStackNodes(); + + handleCallsitesWithMultipleTargets(); + + // Strip off remaining callsite metadata, no longer needed. + for (auto &FuncEntry : FuncToCallsWithMetadata) + for (auto &Call : FuncEntry.second) + Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr); +} + +template +void CallsiteContextGraph::handleCallsitesWithMultipleTargets() { + // Look for and workaround callsites that call multiple functions. 
+ // This can happen for indirect calls, which needs better handling, and in + // more rare cases (e.g. macro expansion). + // TODO: To fix this for indirect calls we will want to perform speculative + // devirtualization using either the normal PGO info with ICP, or using the + // information in the profiled MemProf contexts. We can do this prior to + // this transformation for regular LTO, and for ThinLTO we can simulate that + // effect in the summary and perform the actual speculative devirtualization + // while cloning in the ThinLTO backend. + for (auto Entry = NonAllocationCallToContextNodeMap.begin(); + Entry != NonAllocationCallToContextNodeMap.end();) { + auto *Node = Entry->second; + assert(Node->Clones.empty()); + // Check all node callees and see if in the same function. + bool Removed = false; + auto Call = Node->Call.call(); + for (auto &Edge : Node->CalleeEdges) { + if (!Edge->Callee->hasCall()) + continue; + assert(NodeToCallingFunc.count(Edge->Callee)); + // Check if the called function matches that of the callee node. + if (calleeMatchesFunc(Call, NodeToCallingFunc[Edge->Callee])) + continue; + // Work around by setting Node to have a null call, so it gets + // skipped during cloning. Otherwise assignFunctions will assert + // because its data structures are not designed to handle this case. + Entry = NonAllocationCallToContextNodeMap.erase(Entry); + Node->setCall(CallInfo()); + Removed = true; + break; + } + if (!Removed) + Entry++; + } +} + +uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const { + // In the Module (IR) case this is already the Id. + return IdOrIndex; +} + +bool ModuleCallsiteContextGraph::calleeMatchesFunc(Instruction *Call, + const Function *Func) { + auto *CB = dyn_cast(Call); + if (!CB->getCalledOperand()) + return false; + auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts(); + auto *CalleeFunc = dyn_cast(CalleeVal); + if (CalleeFunc == Func) + return true; + auto *Alias = dyn_cast(CalleeVal); + return Alias && Alias->getAliasee() == Func; +} + +static std::string getAllocTypeString(uint8_t AllocTypes) { + if (!AllocTypes) + return "None"; + std::string Str; + if (AllocTypes & (uint8_t)AllocationType::NotCold) + Str += "NotCold"; + if (AllocTypes & (uint8_t)AllocationType::Cold) + Str += "Cold"; + return Str; +} + +template +void CallsiteContextGraph::ContextNode::dump() + const { + print(dbgs()); + dbgs() << "\n"; +} + +template +void CallsiteContextGraph::ContextNode::print( + raw_ostream &OS) const { + OS << "Node " << this << "\n"; + OS << "\t"; + printCall(OS); + if (Recursive) + OS << " (recursive)"; + OS << "\n"; + OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n"; + OS << "\tContextIds:"; + std::vector SortedIds(ContextIds.begin(), ContextIds.end()); + std::sort(SortedIds.begin(), SortedIds.end()); + for (auto Id : SortedIds) + OS << " " << Id; + OS << "\n"; + OS << "\tCalleeEdges:\n"; + for (auto &Edge : CalleeEdges) + OS << "\t\t" << *Edge << "\n"; + OS << "\tCallerEdges:\n"; + for (auto &Edge : CallerEdges) + OS << "\t\t" << *Edge << "\n"; + if (!Clones.empty()) { + OS << "\tClones: "; + FieldSeparator FS; + for (auto *Clone : Clones) + OS << FS << Clone; + OS << "\n"; + } else if (CloneOf) { + OS << "\tClone of " << CloneOf << "\n"; + } +} + +template +void CallsiteContextGraph::ContextEdge::dump() + const { + print(dbgs()); + dbgs() << "\n"; +} + +template +void CallsiteContextGraph::ContextEdge::print( + raw_ostream &OS) const { + OS << "Edge from Callee " << Callee << " to Caller: " << 
Caller + << " AllocTypes: " << getAllocTypeString(AllocTypes); + OS << " ContextIds:"; + std::vector SortedIds(ContextIds.begin(), ContextIds.end()); + std::sort(SortedIds.begin(), SortedIds.end()); + for (auto Id : SortedIds) + OS << " " << Id; +} + +template +void CallsiteContextGraph::dump() const { + print(dbgs()); +} + +template +void CallsiteContextGraph::print( + raw_ostream &OS) const { + OS << "Callsite Context Graph:\n"; + using GraphType = const CallsiteContextGraph *; + for (const auto Node : nodes(this)) { + if (Node->isRemoved()) + continue; + Node->print(OS); + OS << "\n"; + } +} + +template +static void checkEdge( + const std::shared_ptr> &Edge) { + // Confirm that alloc type is not None and that we have at least one context + // id. + assert(Edge->AllocTypes != (uint8_t)AllocationType::None); + assert(!Edge->ContextIds.empty()); +} + +template +static void checkNode(const ContextNode *Node) { + if (Node->isRemoved()) + return; + // Node's context ids should be the union of both its callee and caller edge + // context ids. + if (Node->CallerEdges.size()) { + auto EI = Node->CallerEdges.begin(); + auto &FirstEdge = *EI; + EI++; + DenseSet CallerEdgeContextIds(FirstEdge->ContextIds); + for (; EI != Node->CallerEdges.end(); EI++) { + const auto &Edge = *EI; + set_union(CallerEdgeContextIds, Edge->ContextIds); + } + // Node can have more context ids than callers if some contexts terminate at + // node and some are longer. + assert(Node->ContextIds == CallerEdgeContextIds || + set_is_subset(CallerEdgeContextIds, Node->ContextIds)); + } + if (Node->CalleeEdges.size()) { + auto EI = Node->CalleeEdges.begin(); + auto &FirstEdge = *EI; + EI++; + DenseSet CalleeEdgeContextIds(FirstEdge->ContextIds); + for (; EI != Node->CalleeEdges.end(); EI++) { + const auto &Edge = *EI; + set_union(CalleeEdgeContextIds, Edge->ContextIds); + } + assert(Node->ContextIds == CalleeEdgeContextIds); + } +} + +template +void CallsiteContextGraph::check() const { + using GraphType = const CallsiteContextGraph *; + for (const auto Node : nodes(this)) { + checkNode(Node); + for (auto &Edge : Node->CallerEdges) + checkEdge(Edge); + } +} + +template +struct GraphTraits *> { + using GraphType = const CallsiteContextGraph *; + using NodeRef = const ContextNode *; + + using NodePtrTy = std::unique_ptr>; + static NodeRef getNode(const NodePtrTy &P) { return P.get(); } + + using nodes_iterator = + mapped_iterator::const_iterator, + decltype(&getNode)>; + + static nodes_iterator nodes_begin(GraphType G) { + return nodes_iterator(G->NodeOwner.begin(), &getNode); + } + + static nodes_iterator nodes_end(GraphType G) { + return nodes_iterator(G->NodeOwner.end(), &getNode); + } + + static NodeRef getEntryNode(GraphType G) { + return G->NodeOwner.begin()->get(); + } + + using EdgePtrTy = std::shared_ptr>; + static const ContextNode * + GetCallee(const EdgePtrTy &P) { + return P->Callee; + } + + using ChildIteratorType = + mapped_iterator>>::const_iterator, + decltype(&GetCallee)>; + + static ChildIteratorType child_begin(NodeRef N) { + return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee); + } + + static ChildIteratorType child_end(NodeRef N) { + return ChildIteratorType(N->CalleeEdges.end(), &GetCallee); + } +}; + +template +struct DOTGraphTraits *> + : public DefaultDOTGraphTraits { + DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} + + using GraphType = const CallsiteContextGraph *; + using GTraits = GraphTraits; + using NodeRef = typename GTraits::NodeRef; + using ChildIteratorType = 
typename GTraits::ChildIteratorType; + + static std::string getNodeLabel(NodeRef Node, GraphType G) { + std::string LabelString = + (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") + + Twine(Node->OrigStackOrAllocId)) + .str(); + LabelString += "\n"; + if (Node->hasCall()) { + auto Func = G->NodeToCallingFunc.find(Node); + assert(Func != G->NodeToCallingFunc.end()); + LabelString += + G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo()); + } else { + LabelString += "null call"; + if (Node->Recursive) + LabelString += " (recursive)"; + else + LabelString += " (external)"; + } + return LabelString; + } + + static std::string getNodeAttributes(NodeRef Node, GraphType) { + std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " + + getContextIds(Node->ContextIds) + "\"") + .str(); + AttributeString += + (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str(); + AttributeString += ",style=\"filled\""; + if (Node->CloneOf) { + AttributeString += ",color=\"blue\""; + AttributeString += ",style=\"filled,bold,dashed\""; + } else + AttributeString += ",style=\"filled\""; + return AttributeString; + } + + static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter, + GraphType) { + auto &Edge = *(ChildIter.getCurrent()); + return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" + + Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"") + .str(); + } + + // Since the NodeOwners list includes nodes that are no longer connected to + // the graph, skip them here. + static bool isNodeHidden(NodeRef Node, GraphType) { + return Node->isRemoved(); + } + +private: + static std::string getContextIds(const DenseSet &ContextIds) { + std::string IdString = "ContextIds:"; + if (ContextIds.size() < 100) { + std::vector SortedIds(ContextIds.begin(), ContextIds.end()); + std::sort(SortedIds.begin(), SortedIds.end()); + for (auto Id : SortedIds) + IdString += (" " + Twine(Id)).str(); + } else { + IdString += (" (" + Twine(ContextIds.size()) + " ids)").str(); + } + return IdString; + } + + static std::string getColor(uint8_t AllocTypes) { + if (AllocTypes == (uint8_t)AllocationType::NotCold) + // Color "brown1" actually looks like a lighter red. + return "brown1"; + if (AllocTypes == (uint8_t)AllocationType::Cold) + return "cyan"; + if (AllocTypes == + ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold)) + // Lighter purple. + return "mediumorchid1"; + return "gray"; + } + + static std::string getNodeId(NodeRef Node) { + std::stringstream SStream; + SStream << std::hex << "N0x" << (unsigned long long)Node; + std::string Result = SStream.str(); + return Result; + } +}; + +template +void CallsiteContextGraph::exportToDot( + std::string Label) const { + WriteGraph(this, "", false, Label, + DotFilePathPrefix + "ccg." 
+ Label + ".dot"); +} + +template +bool CallsiteContextGraph::process() { + if (DumpCCG) { + dbgs() << "CCG before cloning:\n"; + dbgs() << *this; + } + if (ExportToDot) + exportToDot("postbuild"); + + if (VerifyCCG) { + check(); + } + + return false; +} + +bool MemProfContextDisambiguation::processModule(Module &M) { + bool Changed = false; + + ModuleCallsiteContextGraph CCG(M); + Changed = CCG.process(); + + return Changed; +} + +PreservedAnalyses MemProfContextDisambiguation::run(Module &M, + ModuleAnalysisManager &AM) { + if (!processModule(M)) + return PreservedAnalyses::all(); + return PreservedAnalyses::none(); +} diff --git a/llvm/test/ThinLTO/X86/memprof-summary.ll b/llvm/test/ThinLTO/X86/memprof-summary.ll deleted file mode 100644 index 597cd44c030e7..0000000000000 --- a/llvm/test/ThinLTO/X86/memprof-summary.ll +++ /dev/null @@ -1,184 +0,0 @@ -;; Check memprof summaries (per module, combined index, and distributed indexes) - -; RUN: split-file %s %t -; RUN: opt -module-summary %t/a.ll -o %ta.bc -; RUN: opt -module-summary %t/b.ll -o %tb.bc - -; RUN: llvm-dis -o - %ta.bc | FileCheck %s --check-prefix=PRELINKDISA -; PRELINKDISA: gv: (name: "main", {{.*}} callsites: ((callee: ^2, clones: (0), stackIds: (8632435727821051414)), (callee: ^2, clones: (0), stackIds: (15025054523792398438)))))) ; guid = 15822663052811949562 - -; RUN: llvm-dis -o - %tb.bc | FileCheck %s --check-prefix=PRELINKDISB -; PRELINKDISB: ^[[PLBAR:[0-9]+]] = gv: (name: "_Z3barv", {{.*}} allocs: ((versions: (none), memProf: ((type: notcold, stackIds: (12481870273128938184, 2732490490862098848, 8632435727821051414)), (type: cold, stackIds: (12481870273128938184, 2732490490862098848, 15025054523792398438)))))))) ; guid = 4555904644815367798 -; PRELINKDISB: ^[[PLFOO:[0-9]+]] = gv: (name: "_Z3foov", {{.*}} callsites: ((callee: ^[[PLBAZ:[0-9]+]], clones: (0), stackIds: (2732490490862098848)))))) ; guid = 9191153033785521275 -; PRELINKDISB: ^[[PLBAZ]] = gv: (name: "_Z3bazv", {{.*}} callsites: ((callee: ^[[PLBAR]], clones: (0), stackIds: (12481870273128938184)))))) ; guid = 15176620447596392000 - -; RUN: llvm-bcanalyzer -dump %ta.bc | FileCheck %s --check-prefix=PRELINKBCANA -; PRELINKBCANA: - -; RUN: llvm-bcanalyzer -dump %tb.bc | FileCheck %s --check-prefix=PRELINKBCANB -; PRELINKBCANB: - -; RUN: llvm-lto2 run %ta.bc %tb.bc -o %t -save-temps \ -; RUN: -thinlto-distributed-indexes \ -; RUN: -r=%ta.bc,main,plx \ -; RUN: -r=%ta.bc,_Z3foov, \ -; RUN: -r=%ta.bc,free, \ -; RUN: -r=%ta.bc,sleep, \ -; RUN: -r=%tb.bc,_Z3foov,pl \ -; RUN: -r=%tb.bc,_Znam, \ -; RUN: -r=%tb.bc,_Z3bazv,pl - -; RUN: llvm-dis -o - %t.index.bc | FileCheck %s --check-prefix=COMBINEDDIS -; COMBINEDDIS: ^[[COMBBAR:[0-9]+]] = gv: (guid: 4555904644815367798, {{.*}} allocs: ((versions: (none), memProf: ((type: notcold, stackIds: (12481870273128938184, 2732490490862098848, 8632435727821051414)), (type: cold, stackIds: (12481870273128938184, 2732490490862098848, 15025054523792398438)))))))) -; COMBINEDDIS: ^[[COMBFOO:[0-9]+]] = gv: (guid: 9191153033785521275, {{.*}} callsites: ((callee: ^[[COMBBAZ:[0-9]+]], clones: (0), stackIds: (2732490490862098848)))))) -; COMBINEDDIS: ^[[COMBBAZ]] = gv: (guid: 15176620447596392000, {{.*}} callsites: ((callee: ^[[COMBBAR]], clones: (0), stackIds: (12481870273128938184)))))) -; COMBINEDDIS: ^[[COMBMAIN:[0-9]+]] = gv: (guid: 15822663052811949562, {{.*}} callsites: ((callee: ^[[COMBFOO]], clones: (0), stackIds: (8632435727821051414)), (callee: ^[[COMBFOO]], clones: (0), stackIds: (15025054523792398438)))))) - -; RUN: 
llvm-bcanalyzer -dump %t.index.bc | FileCheck %s --check-prefix=COMBINEDBCAN -; COMBINEDBCAN: - -; RUN: llvm-dis -o - %ta.bc.thinlto.bc | FileCheck %s --check-prefix=DISTRIBUTEDDISA -; DISTRIBUTEDDISA: gv: (guid: 9191153033785521275, {{.*}} callsites: ((callee: null, clones: (0), stackIds: (2732490490862098848)))))) -; DISTRIBUTEDDISA: gv: (guid: 15822663052811949562, {{.*}} callsites: ((callee: ^2, clones: (0), stackIds: (8632435727821051414)), (callee: ^2, clones: (0), stackIds: (15025054523792398438)))))) - -; RUN: llvm-dis -o - %tb.bc.thinlto.bc | FileCheck %s --check-prefix=DISTRIBUTEDDISB -; DISTRIBUTEDDISB: ^[[DISTRBAR:[0-9]+]] = gv: (guid: 4555904644815367798, {{.*}} allocs: ((versions: (none), memProf: ((type: notcold, stackIds: (12481870273128938184, 2732490490862098848, 8632435727821051414)), (type: cold, stackIds: (12481870273128938184, 2732490490862098848, 15025054523792398438)))))))) -; DISTRIBUTEDDISB: ^[[DISTRFOO:[0-9]+]] = gv: (guid: 9191153033785521275, {{.*}} callsites: ((callee: ^[[DISTRBAZ:[0-9]+]], clones: (0), stackIds: (2732490490862098848)))))) -; DISTRIBUTEDDISB: ^[[DISTRBAZ]] = gv: (guid: 15176620447596392000, {{.*}} callsites: ((callee: ^[[DISTRBAR]], clones: (0), stackIds: (12481870273128938184)))))) - -; RUN: llvm-bcanalyzer -dump %ta.bc.thinlto.bc | FileCheck %s --check-prefix=DISTRIBUTEDBCANA -; DISTRIBUTEDBCANA: - -; RUN: llvm-bcanalyzer -dump %tb.bc.thinlto.bc | FileCheck %s --check-prefix=DISTRIBUTEDBCANB -; DISTRIBUTEDBCANB: - -;--- a.ll -; ModuleID = 'a.cc' -source_filename = "a.cc" -target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; Function Attrs: mustprogress norecurse uwtable -define dso_local noundef i32 @main(i32 noundef %argc, ptr nocapture noundef readnone %argv) local_unnamed_addr #0 !dbg !39 { -entry: - %call = call noundef ptr @_Z3foov(), !dbg !42, !callsite !43 - %call1 = call noundef ptr @_Z3foov(), !dbg !44, !callsite !45 - call void @llvm.memset.p0.i64(ptr noundef nonnull align 1 dereferenceable(10) %call, i8 0, i64 10, i1 false), !dbg !46 - call void @llvm.memset.p0.i64(ptr noundef nonnull align 1 dereferenceable(10) %call1, i8 0, i64 10, i1 false), !dbg !47 - call void @free(ptr noundef %call) #4, !dbg !48 - %call2 = call i32 @sleep(i32 noundef 10), !dbg !49 - call void @free(ptr noundef %call1) #4, !dbg !50 - ret i32 0, !dbg !51 -} - -declare !dbg !52 noundef ptr @_Z3foov() local_unnamed_addr #1 - -; Function Attrs: argmemonly mustprogress nocallback nofree nounwind willreturn writeonly -declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2 - -; Function Attrs: inaccessiblemem_or_argmemonly mustprogress nounwind willreturn allockind("free") -declare void @free(ptr allocptr nocapture noundef) local_unnamed_addr #3 - -declare !dbg !53 i32 @sleep(i32 noundef) local_unnamed_addr #1 - -attributes #0 = { mustprogress norecurse uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } -attributes #1 = { "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } -attributes #2 = { argmemonly mustprogress nocallback nofree nounwind willreturn writeonly } -attributes #3 = { 
inaccessiblemem_or_argmemonly mustprogress nounwind willreturn allockind("free") "alloc-family"="malloc" "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } -attributes #4 = { nounwind } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!2, !3, !4, !5, !6, !7, !8} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 16.0.0 (git@github.com:llvm/llvm-project.git ffecb643ee2c49e55e0689339b6d5921b5e6ff8b)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None) -!1 = !DIFile(filename: "a.cc", directory: ".", checksumkind: CSK_MD5, checksum: "ebabd56909271a1d4a7cac81c10624d5") -!2 = !{i32 7, !"Dwarf Version", i32 5} -!3 = !{i32 2, !"Debug Info Version", i32 3} -!4 = !{i32 1, !"wchar_size", i32 4} -!5 = !{i32 8, !"PIC Level", i32 2} -!6 = !{i32 7, !"PIE Level", i32 2} -!7 = !{i32 7, !"uwtable", i32 2} -!8 = !{i32 7, !"frame-pointer", i32 2} -!39 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 5, type: !40, scopeLine: 5, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !41) -!40 = !DISubroutineType(types: !41) -!41 = !{} -!42 = !DILocation(line: 6, column: 13, scope: !39) -!43 = !{i64 8632435727821051414} -!44 = !DILocation(line: 7, column: 13, scope: !39) -!45 = !{i64 -3421689549917153178} -!46 = !DILocation(line: 8, column: 3, scope: !39) -!47 = !DILocation(line: 9, column: 3, scope: !39) -!48 = !DILocation(line: 10, column: 3, scope: !39) -!49 = !DILocation(line: 11, column: 3, scope: !39) -!50 = !DILocation(line: 12, column: 3, scope: !39) -!51 = !DILocation(line: 13, column: 3, scope: !39) -!52 = !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 4, type: !40, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !41) -!53 = !DISubprogram(name: "sleep", scope: !54, file: !54, line: 453, type: !40, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !41) -!54 = !DIFile(filename: "include/unistd.h", directory: "/usr", checksumkind: CSK_MD5, checksum: "ee8f41a17f563f029d0e930ad871815a") - -;--- b.ll -; ModuleID = 'b.cc' -source_filename = "b.cc" -target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; Function Attrs: mustprogress noinline uwtable -define internal noalias noundef nonnull ptr @_Z3barv() local_unnamed_addr #0 !dbg !39 { -entry: - %call = call noalias noundef nonnull dereferenceable(10) ptr @_Znam(i64 noundef 10) #2, !dbg !42, !memprof !43, !callsite !48 - ret ptr %call, !dbg !49 -} - -; Function Attrs: nobuiltin allocsize(0) -declare noundef nonnull ptr @_Znam(i64 noundef) local_unnamed_addr #1 - -; Function Attrs: mustprogress noinline uwtable -define dso_local noalias noundef nonnull ptr @_Z3bazv() local_unnamed_addr #0 !dbg !50 { -entry: - %call = call noundef ptr @_Z3barv(), !dbg !51, !callsite !52 - ret ptr %call, !dbg !53 -} - -; Function Attrs: mustprogress uwtable -define dso_local noalias noundef nonnull ptr @_Z3foov() local_unnamed_addr #3 !dbg !54 { -entry: - %call = call noundef ptr @_Z3bazv(), !dbg !55, !callsite !56 - ret ptr %call, !dbg !57 -} - -attributes #0 = { mustprogress noinline uwtable "disable-tail-calls"="true" "frame-pointer"="all" 
"min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } -attributes #1 = { nobuiltin allocsize(0) "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } -attributes #2 = { builtin allocsize(0) } -attributes #3 = { mustprogress uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!2, !3, !4, !5, !6, !7, !8} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 16.0.0 (git@github.com:llvm/llvm-project.git ffecb643ee2c49e55e0689339b6d5921b5e6ff8b)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None) -!1 = !DIFile(filename: "b.cc", directory: ".", checksumkind: CSK_MD5, checksum: "335f81d275af57725cfc9ffc7be49bc2") -!2 = !{i32 7, !"Dwarf Version", i32 5} -!3 = !{i32 2, !"Debug Info Version", i32 3} -!4 = !{i32 1, !"wchar_size", i32 4} -!5 = !{i32 8, !"PIC Level", i32 2} -!6 = !{i32 7, !"PIE Level", i32 2} -!7 = !{i32 7, !"uwtable", i32 2} -!8 = !{i32 7, !"frame-pointer", i32 2} -!39 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !1, file: !1, line: 1, type: !40, scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !41) -!40 = !DISubroutineType(types: !41) -!41 = !{} -!42 = !DILocation(line: 2, column: 10, scope: !39) -!43 = !{!44, !46} -!44 = !{!45, !"notcold"} -!45 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} -!46 = !{!47, !"cold"} -!47 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} -!48 = !{i64 9086428284934609951} -!49 = !DILocation(line: 2, column: 3, scope: !39) -!50 = distinct !DISubprogram(name: "baz", linkageName: "_Z3bazv", scope: !1, file: !1, line: 5, type: !40, scopeLine: 5, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !41) -!51 = !DILocation(line: 6, column: 10, scope: !50) -!52 = !{i64 -5964873800580613432} -!53 = !DILocation(line: 6, column: 3, scope: !50) -!54 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 9, type: !40, scopeLine: 9, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !41) -!55 = !DILocation(line: 10, column: 10, scope: !54) -!56 = !{i64 2732490490862098848} -!57 = !DILocation(line: 10, column: 3, scope: !54) diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll b/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll new file mode 100644 index 0000000000000..539d88a815ed1 --- /dev/null +++ b/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll @@ -0,0 +1,158 @@ +;; Test callsite context graph generation for simple call graph with +;; two memprof contexts and no inlining. 
+;; +;; Original code looks like: +;; +;; char *bar() { +;; return new char[10]; +;; } +;; +;; char *baz() { +;; return bar(); +;; } +;; +;; char *foo() { +;; return baz(); +;; } +;; +;; int main(int argc, char **argv) { +;; char *x = foo(); +;; char *y = foo(); +;; memset(x, 0, 10); +;; memset(y, 0, 10); +;; delete[] x; +;; sleep(10); +;; delete[] y; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; The IR was then reduced using llvm-reduce with the expected FileCheck input. + +; RUN: opt -passes=memprof-context-disambiguation \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ +; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP + +; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @main() #0 { +entry: + %call = call noundef ptr @_Z3foov(), !callsite !0 + %call1 = call noundef ptr @_Z3foov(), !callsite !1 + ret i32 0 +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #1 + +; Function Attrs: nobuiltin +declare void @_ZdaPv() #2 + +define internal ptr @_Z3barv() #3 { +entry: + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !2, !callsite !7 + ret ptr null +} + +declare ptr @_Znam(i64) + +define internal ptr @_Z3bazv() #4 { +entry: + %call = call noundef ptr @_Z3barv(), !callsite !8 + ret ptr null +} + +; Function Attrs: noinline +define internal ptr @_Z3foov() #5 { +entry: + %call = call noundef ptr @_Z3bazv(), !callsite !9 + ret ptr null +} + +; uselistorder directives +uselistorder ptr @_Z3foov, { 1, 0 } + +attributes #0 = { "tune-cpu"="generic" } +attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #2 = { nobuiltin } +attributes #3 = { "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" } +attributes #4 = { "stack-protector-buffer-size"="8" } +attributes #5 = { noinline } +attributes #6 = { builtin } + +!0 = !{i64 8632435727821051414} +!1 = !{i64 -3421689549917153178} +!2 = !{!3, !5} +!3 = !{!4, !"notcold"} +!4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!5 = !{!6, !"cold"} +!6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} +!7 = !{i64 9086428284934609951} +!8 = !{i64 -5964873800580613432} +!9 = !{i64 2732490490862098848} + + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[BAR:0x[a-z0-9]+]] +; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 + +; DUMP: Node [[BAZ]] +; DUMP: %call = call noundef ptr @_Z3barv() (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 + +; DUMP: Node [[FOO]] +; DUMP: %call 
= call noundef ptr @_Z3bazv() (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 + +; DUMP: Node [[MAIN1]] +; DUMP: %call = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN2]] +; DUMP: %call1 = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: + + +; DOT: digraph "postbuild" { +; DOT: label="postbuild"; +; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"]; +; DOT: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 12481870273128938184\n_Z3bazv -\> _Z3barv}"]; +; DOT: Node[[BAZ]] -> Node[[BAR]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"]; +; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 2732490490862098848\n_Z3foov -\> _Z3bazv}"]; +; DOT: Node[[FOO]] -> Node[[BAZ]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"]; +; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN1]] -> Node[[FOO]][tooltip="ContextIds: 1",fillcolor="brown1"]; +; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN2]] -> Node[[FOO]][tooltip="ContextIds: 2",fillcolor="cyan"]; +; DOT: } diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids.ll b/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids.ll new file mode 100644 index 0000000000000..c5ed97f182a98 --- /dev/null +++ b/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids.ll @@ -0,0 +1,232 @@ +;; Test callsite context graph generation for call graph with with MIBs +;; that have pruned contexts that partially match multiple inlined +;; callsite contexts, requiring duplication of context ids and nodes +;; while matching callsite nodes onto the graph. 
+;; +;; Original code looks like: +;; +;; char *D() { +;; return new char[10]; +;; } +;; +;; char *F() { +;; return D(); +;; } +;; +;; char *C() { +;; return D(); +;; } +;; +;; char *B() { +;; return C(); +;; } +;; +;; char *E() { +;; return C(); +;; } +;; int main(int argc, char **argv) { +;; char *x = B(); // cold +;; char *y = E(); // cold +;; char *z = F(); // default +;; memset(x, 0, 10); +;; memset(y, 0, 10); +;; memset(z, 0, 10); +;; delete[] z; +;; sleep(10); +;; delete[] x; +;; delete[] y; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; The code below was created by forcing inlining of C into both B and E. +;; Since both allocation contexts via C are cold, the matched memprof +;; metadata has the context pruned above C's callsite. This requires +;; matching the stack node for C to callsites where it was inlined (i.e. +;; the callsites in B and E that have callsite metadata that includes C's). +;; It also requires duplication of that node in the graph as well as the +;; duplication of the context ids along that path through the graph, +;; so that we can represent the duplicated (via inlining) C callsite. +;; +;; The IR was then reduced using llvm-reduce with the expected FileCheck input. + +; RUN: opt -passes=memprof-context-disambiguation \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ +; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP + +; RUN: cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE +; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define internal ptr @_Z1Dv() { +entry: + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !0, !callsite !5 + ret ptr null +} + +declare ptr @_Znam(i64) + +define internal ptr @_Z1Fv() #0 { +entry: + %call = call noundef ptr @_Z1Dv(), !callsite !6 + ret ptr null +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal ptr @_Z1Cv() #1 { +entry: + %call = call noundef ptr @_Z1Dv(), !callsite !7 + ret ptr null +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal ptr @_Z1Bv() #1 { +entry: + %call.i = call noundef ptr @_Z1Dv(), !callsite !8 + ret ptr null +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal ptr @_Z1Ev() #1 { +entry: + %call.i = call noundef ptr @_Z1Dv(), !callsite !9 + ret ptr null +} + +; Function Attrs: noinline +declare i32 @main() #2 + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #3 + +; Function Attrs: nounwind +declare void @_ZdaPv() #4 + +declare i32 @sleep() #5 + +attributes #0 = { "disable-tail-calls"="true" } +attributes #1 = { mustprogress noinline optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { noinline } +attributes #3 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #4 = { nounwind } +attributes #5 = { "no-trapping-math"="true" } +attributes #6 = { builtin } + +!0 = 
!{!1, !3} +!1 = !{!2, !"cold"} +!2 = !{i64 6541423618768552252, i64 -6270142974039008131} +!3 = !{!4, !"notcold"} +!4 = !{i64 6541423618768552252, i64 -4903163940066524832} +!5 = !{i64 6541423618768552252} +!6 = !{i64 -4903163940066524832} +!7 = !{i64 -6270142974039008131} +!8 = !{i64 -6270142974039008131, i64 -184525619819294889} +!9 = !{i64 -6270142974039008131, i64 1905834578520680781} + + +;; After adding only the alloc node memprof metadata, we only have 2 contexts. + +; DUMP: CCG before updating call stack chains: +; DUMP: Callsite Context Graph: +; DUMP: Node [[D:0x[a-z0-9]+]] +; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[C:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 1 +; DUMP: Edge from Callee [[D]] to Caller: [[F:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 2 + +; DUMP: Node [[C]] +; DUMP: null Call +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[C]] AllocTypes: Cold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[F]] +; DUMP: null Call +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[F]] AllocTypes: NotCold ContextIds: 2 +; DUMP: CallerEdges: + +;; After updating for callsite metadata, we should have generated context ids 3 and 4, +;; along with 2 new nodes for those callsites. All have the same allocation type +;; behavior as the original C node. + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[D]] +; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 3 4 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[F]] AllocTypes: NotCold ContextIds: 2 +; DUMP: Edge from Callee [[D]] to Caller: [[C2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 3 +; DUMP: Edge from Callee [[D]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4 +; DUMP: Edge from Callee [[D]] to Caller: [[E:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 1 + +; DUMP: Node [[F]] +; DUMP: %call = call noundef ptr @_Z1Dv() (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[F]] AllocTypes: NotCold ContextIds: 2 +; DUMP: CallerEdges: + +; DUMP: Node [[C2]] +; DUMP: %call = call noundef ptr @_Z1Dv() (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 3 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[C2]] AllocTypes: Cold ContextIds: 3 +; DUMP: CallerEdges: + +; DUMP: Node [[B]] +; DUMP: %call.i = call noundef ptr @_Z1Dv() (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[B]] AllocTypes: Cold ContextIds: 4 +; DUMP: CallerEdges: + +; DUMP: Node [[E]] +; DUMP: %call.i = call noundef ptr @_Z1Dv() (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[E]] AllocTypes: Cold ContextIds: 1 +; DUMP: CallerEdges: + + +; DOTPRE: digraph "prestackupdate" { +; DOTPRE: label="prestackupdate"; +; DOTPRE: Node[[D:0x[a-z0-9]+]] [shape=record,tooltip="N[[D]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z1Dv -\> _Znam}"]; +; DOTPRE: Node[[C:0x[a-z0-9]+]] 
[shape=record,tooltip="N[[C]] ContextIds: 1",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 12176601099670543485\nnull call (external)}"]; +; DOTPRE: Node[[C]] -> Node[[D]][tooltip="ContextIds: 1",fillcolor="cyan"]; +; DOTPRE: Node[[F:0x[a-z0-9]+]] [shape=record,tooltip="N[[F]] ContextIds: 2",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 13543580133643026784\nnull call (external)}"]; +; DOTPRE: Node[[F]] -> Node[[D]][tooltip="ContextIds: 2",fillcolor="brown1"]; +; DOTPRE: } + + +; DOTPOST:digraph "postbuild" { +; DOTPOST: label="postbuild"; +; DOTPOST: Node[[D:0x[a-z0-9]+]] [shape=record,tooltip="N[[D]] ContextIds: 1 2 3 4",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z1Dv -\> _Znam}"]; +; DOTPOST: Node[[F:0x[a-z0-9]+]] [shape=record,tooltip="N[[F]] ContextIds: 2",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 13543580133643026784\n_Z1Fv -\> _Z1Dv}"]; +; DOTPOST: Node[[F]] -> Node[[D]][tooltip="ContextIds: 2",fillcolor="brown1"]; +; DOTPOST: Node[[C:0x[a-z0-9]+]] [shape=record,tooltip="N[[C]] ContextIds: 3",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 0\n_Z1Cv -\> _Z1Dv}"]; +; DOTPOST: Node[[C]] -> Node[[D]][tooltip="ContextIds: 3",fillcolor="cyan"]; +; DOTPOST: Node[[B:0x[a-z0-9]+]] [shape=record,tooltip="N[[B]] ContextIds: 4",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 0\n_Z1Bv -\> _Z1Dv}"]; +; DOTPOST: Node[[B]] -> Node[[D]][tooltip="ContextIds: 4",fillcolor="cyan"]; +; DOTPOST: Node[[E:0x[a-z0-9]+]] [shape=record,tooltip="N[[E]] ContextIds: 1",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 0\n_Z1Ev -\> _Z1Dv}"]; +; DOTPOST: Node[[E]] -> Node[[D]][tooltip="ContextIds: 1",fillcolor="cyan"]; +; DOTPOST:} diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids2.ll b/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids2.ll new file mode 100644 index 0000000000000..da0fd3f44b45e --- /dev/null +++ b/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids2.ll @@ -0,0 +1,386 @@ +;; Test callsite context graph generation for call graph with with MIBs +;; that have pruned contexts that partially match multiple inlined +;; callsite contexts, requiring duplication of context ids and nodes +;; while matching callsite nodes onto the graph. This test requires more +;; complex duplication due to multiple contexts for different allocations +;; that share some of the same callsite nodes. 
+;; +;; Original code looks like: +;; +;; char *D(bool Call1) { +;; if (Call1) +;; return new char[10]; +;; else +;; return new char[10]; +;; } +;; +;; char *C(bool Call1) { +;; return D(Call1); +;; } +;; +;; char *B(bool Call1) { +;; if (Call1) +;; return C(true); +;; else +;; return C(false); +;; } +;; +;; char *A(bool Call1) { +;; return B(Call1); +;; } +;; +;; char *A1() { +;; return A(true); +;; } +;; +;; char *A2() { +;; return A(true); +;; } +;; +;; char *A3() { +;; return A(false); +;; } +;; +;; char *A4() { +;; return A(false); +;; } +;; +;; char *E() { +;; return B(true); +;; } +;; +;; char *F() { +;; return B(false); +;; } +;; +;; int main(int argc, char **argv) { +;; char *a1 = A1(); // cold +;; char *a2 = A2(); // cold +;; char *e = E(); // default +;; char *a3 = A3(); // default +;; char *a4 = A4(); // default +;; char *f = F(); // cold +;; memset(a1, 0, 10); +;; memset(a2, 0, 10); +;; memset(e, 0, 10); +;; memset(a3, 0, 10); +;; memset(a4, 0, 10); +;; memset(f, 0, 10); +;; delete[] a3; +;; delete[] a4; +;; delete[] e; +;; sleep(10); +;; delete[] a1; +;; delete[] a2; +;; delete[] f; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; The code below was created by forcing inlining of A into its callers, +;; without any other inlining or optimizations. Since both allocation contexts +;; via A for each allocation in D have the same allocation type (cold via +;; A1 and A2 for the first new in D, and non-cold via A3 and A4 for the second +;; new in D, the contexts for those respective allocations are pruned above A. +;; The allocations via E and F are to ensure we don't prune above B. +;; +;; The matching onto the inlined A[1234]->A sequences will require duplication +;; of the context id assigned to the context from A for each allocation in D. +;; This test ensures that we do this correctly in the presence of callsites +;; shared by the different duplicated context ids (i.e. callsite in C). +;; +;; The IR was then reduced using llvm-reduce with the expected FileCheck input. + +; RUN: opt -passes=memprof-context-disambiguation \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ +; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP + + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: mustprogress noinline uwtable +define ptr @_Z1Db(i1 %Call1) #0 { +entry: + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7, !memprof !0, !callsite !5 + br label %return + +if.else: ; No predecessors! + %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7, !memprof !6, !callsite !11 + br label %return + +return: ; preds = %if.else, %entry + ret ptr null +} + +; Function Attrs: nobuiltin +declare ptr @_Znam(i64) #1 + +define ptr @_Z1Cb(i1 %Call1) { +entry: + %tobool = trunc i8 0 to i1 + %call = call noundef ptr @_Z1Db(i1 noundef zeroext %tobool), !callsite !12 + ret ptr null +} + +; Function Attrs: mustprogress noinline uwtable +define ptr @_Z1Bb(i1 %Call1) #0 { +entry: + %call = call noundef ptr @_Z1Cb(i1 noundef zeroext true), !callsite !13 + br label %return + +if.else: ; No predecessors! 
+ %call1 = call noundef ptr @_Z1Cb(i1 noundef zeroext false), !callsite !14 + br label %return + +return: ; preds = %if.else, %entry + ret ptr null +} + +define ptr @_Z1Ab(i1 %tobool) #2 { +entry: + %call = call noundef ptr @_Z1Bb(i1 noundef zeroext %tobool), !callsite !15 + ret ptr null +} + +; Function Attrs: mustprogress noinline uwtable +define ptr @_Z2A1v(i1 %tobool.i) #0 { +entry: + %call.i = call noundef ptr @_Z1Bb(i1 noundef zeroext %tobool.i), !callsite !16 + ret ptr null +} + +; Function Attrs: mustprogress noinline uwtable +define ptr @_Z2A2v(i1 %tobool.i) #0 { +entry: + %call.i = call noundef ptr @_Z1Bb(i1 noundef zeroext %tobool.i), !callsite !17 + ret ptr null +} + +; Function Attrs: mustprogress noinline uwtable +define ptr @_Z2A3v(i1 %tobool.i) #0 { +entry: + %call.i = call noundef ptr @_Z1Bb(i1 noundef zeroext %tobool.i), !callsite !18 + ret ptr null +} + +; Function Attrs: mustprogress noinline uwtable +define ptr @_Z2A4v(i1 %tobool.i) #0 { +entry: + %call.i = call noundef ptr @_Z1Bb(i1 noundef zeroext %tobool.i), !callsite !19 + ret ptr null +} + +; Function Attrs: mustprogress noinline uwtable +define ptr @_Z1Ev() #0 { +entry: + %call = call noundef ptr @_Z1Bb(i1 noundef zeroext true), !callsite !20 + ret ptr null +} + +; Function Attrs: mustprogress noinline uwtable +define ptr @_Z1Fv() #0 { +entry: + %call = call noundef ptr @_Z1Bb(i1 noundef zeroext false), !callsite !21 + ret ptr null +} + +; Function Attrs: noinline +declare i32 @main() #3 + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #4 + +declare void @_ZdaPv() #5 + +declare i32 @sleep() #6 + +; uselistorder directives +uselistorder ptr @_Znam, { 1, 0 } + +attributes #0 = { mustprogress noinline uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { nobuiltin } +attributes #2 = { "tune-cpu"="generic" } +attributes #3 = { noinline } +attributes #4 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #5 = { "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" } +attributes #6 = { "disable-tail-calls"="true" } +attributes #7 = { builtin allocsize(0) } + +!0 = !{!1, !3} +!1 = !{!2, !"notcold"} +!2 = !{i64 4854880825882961848, i64 -904694911315397047, i64 6532298921261778285, i64 1905834578520680781} +!3 = !{!4, !"cold"} +!4 = !{i64 4854880825882961848, i64 -904694911315397047, i64 6532298921261778285, i64 -6528110295079665978} +!5 = !{i64 4854880825882961848} +!6 = !{!7, !9} +!7 = !{!8, !"notcold"} +!8 = !{i64 -8775068539491628272, i64 -904694911315397047, i64 7859682663773658275, i64 -6528110295079665978} +!9 = !{!10, !"cold"} +!10 = !{i64 -8775068539491628272, i64 -904694911315397047, i64 7859682663773658275, i64 -4903163940066524832} +!11 = !{i64 -8775068539491628272} +!12 = !{i64 -904694911315397047} +!13 = !{i64 6532298921261778285} +!14 = !{i64 7859682663773658275} +!15 = !{i64 -6528110295079665978} +!16 = !{i64 -6528110295079665978, i64 5747919905719679568} +!17 = !{i64 -6528110295079665978, i64 -5753238080028016843} +!18 = !{i64 -6528110295079665978, i64 1794685869326395337} +!19 = !{i64 -6528110295079665978, i64 5462047985461644151} +!20 = !{i64 1905834578520680781} +!21 = !{i64 -4903163940066524832} + + +;; After adding only the alloc node memprof metadata, we only 
have 4 contexts (we only +;; match the interesting parts of the pre-update graph here). + +; DUMP: CCG before updating call stack chains: +; DUMP: Callsite Context Graph: + +; DUMP: Node [[D1:0x[a-z0-9]+]] +; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 + +; DUMP: Node [[C:0x[a-z0-9]+]] +; DUMP: null Call +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 3 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D1]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: Edge from Callee [[D2:0x[a-z0-9]+]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 3 4 + +; DUMP: Node [[D2]] +; DUMP: %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 3 4 + + +;; After updating for callsite metadata, we should have duplicated the context +;; ids coming from node A (2 and 3) 4 times, for the 4 different callers of A, +;; and used those on new nodes for those callers. Note that while in reality +;; we only have cold edges coming from A1 and A2 and noncold from A3 and A4, +;; due to the pruning we have lost this information and thus end up duplicating +;; both of A's contexts to all of the new nodes (which could result in some +;; unnecessary cloning. + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[D1]] +; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 5 7 9 11 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[D1]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 1 2 5 7 9 11 + +; DUMP: Node [[C]] +; DUMP: %call = call noundef ptr @_Z1Db(i1 noundef zeroext %tobool) (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 3 4 5 6 7 8 9 10 11 12 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D1]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 1 2 5 7 9 11 +; DUMP: Edge from Callee [[D2]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 3 4 6 8 10 12 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[C]] to Caller: [[B1:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 5 7 9 11 +; DUMP: Edge from Callee [[C]] to Caller: [[B2:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 3 4 6 8 10 12 + +; DUMP: Node [[B1]] +; DUMP: %call = call noundef ptr @_Z1Cb(i1 noundef zeroext true) (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 5 7 9 11 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[C]] to Caller: [[B1]] AllocTypes: NotColdCold ContextIds: 1 2 5 7 9 11 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[B1]] to Caller: [[E:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[B1]] to Caller: [[A2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 5 +; DUMP: Edge from Callee [[B1]] to Caller: [[A3:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 7 +; DUMP: Edge from Callee [[B1]] to Caller: [[A1:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 9 +; DUMP: Edge from Callee [[B1]] to Caller: [[A4:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 11 +; DUMP: Edge from Callee [[B1]] to Caller: [[A:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 + +; DUMP: Node [[E]] +; DUMP: %call = call noundef ptr @_Z1Bb(i1 noundef zeroext true) (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[B1]] to Caller: [[E]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[D2]] 
+; DUMP: %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 3 4 6 8 10 12 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[D2]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 3 4 6 8 10 12 + +; DUMP: Node [[B2]] +; DUMP: %call1 = call noundef ptr @_Z1Cb(i1 noundef zeroext false) (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 3 4 6 8 10 12 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[C]] to Caller: [[B2]] AllocTypes: NotColdCold ContextIds: 3 4 6 8 10 12 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[B2]] to Caller: [[F:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4 +; DUMP: Edge from Callee [[B2]] to Caller: [[A2]] AllocTypes: NotCold ContextIds: 6 +; DUMP: Edge from Callee [[B2]] to Caller: [[A3]] AllocTypes: NotCold ContextIds: 8 +; DUMP: Edge from Callee [[B2]] to Caller: [[A1]] AllocTypes: NotCold ContextIds: 10 +; DUMP: Edge from Callee [[B2]] to Caller: [[A4]] AllocTypes: NotCold ContextIds: 12 +; DUMP: Edge from Callee [[B2]] to Caller: [[A]] AllocTypes: NotCold ContextIds: 3 + +; DUMP: Node [[F]] +; DUMP: %call = call noundef ptr @_Z1Bb(i1 noundef zeroext false) (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[B2]] to Caller: [[F]] AllocTypes: Cold ContextIds: 4 +; DUMP: CallerEdges: + +; DUMP: Node [[A2]] +; DUMP: %call = call noundef ptr @_Z1Bb(i1 noundef zeroext %tobool) (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 5 6 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[B1]] to Caller: [[A2]] AllocTypes: Cold ContextIds: 5 +; DUMP: Edge from Callee [[B2]] to Caller: [[A2]] AllocTypes: NotCold ContextIds: 6 +; DUMP: CallerEdges: + +; DUMP: Node [[A3]] +; DUMP: %call.i = call noundef ptr @_Z1Bb(i1 noundef zeroext %tobool.i) (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 7 8 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[B1]] to Caller: [[A3]] AllocTypes: Cold ContextIds: 7 +; DUMP: Edge from Callee [[B2]] to Caller: [[A3]] AllocTypes: NotCold ContextIds: 8 +; DUMP: CallerEdges: + +; DUMP: Node [[A1]] +; DUMP: %call.i = call noundef ptr @_Z1Bb(i1 noundef zeroext %tobool.i) (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 9 10 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[B1]] to Caller: [[A1]] AllocTypes: Cold ContextIds: 9 +; DUMP: Edge from Callee [[B2]] to Caller: [[A1]] AllocTypes: NotCold ContextIds: 10 +; DUMP: CallerEdges: + +; DUMP: Node [[A4]] +; DUMP: %call.i = call noundef ptr @_Z1Bb(i1 noundef zeroext %tobool.i) (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 11 12 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[B1]] to Caller: [[A4]] AllocTypes: Cold ContextIds: 11 +; DUMP: Edge from Callee [[B2]] to Caller: [[A4]] AllocTypes: NotCold ContextIds: 12 +; DUMP: CallerEdges: + +; DUMP: Node [[A]] +; DUMP: %call.i = call noundef ptr @_Z1Bb(i1 noundef zeroext %tobool.i) (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 2 3 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[B1]] to Caller: [[A]] AllocTypes: Cold ContextIds: 2 +; DUMP: Edge from Callee [[B2]] to Caller: [[A]] AllocTypes: NotCold ContextIds: 3 +; DUMP: CallerEdges: diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll b/llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll new file mode 100644 index 0000000000000..9ebf219dd37a0 --- /dev/null +++ 
b/llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll @@ -0,0 +1,261 @@ +;; Tests callsite context graph generation for call graph containing indirect +;; calls. Currently this should result in conservative behavior, such that the +;; indirect call receives a null call in its graph node, to prevent subsequent +;; cloning. +;; +;; Original code looks like: +;; +;; char *foo() { +;; return new char[10]; +;; } +;; class A { +;; public: +;; virtual char *x() { return foo(); } +;; }; +;; class B : public A { +;; public: +;; char *x() final { return foo(); } +;; }; +;; char *bar(A *a) { +;; return a->x(); +;; } +;; int main(int argc, char **argv) { +;; char *x = foo(); +;; char *y = foo(); +;; B b; +;; char *z = bar(&b); +;; char *w = bar(&b); +;; A a; +;; char *r = bar(&a); +;; char *s = bar(&a); +;; memset(x, 0, 10); +;; memset(y, 0, 10); +;; memset(z, 0, 10); +;; memset(w, 0, 10); +;; memset(r, 0, 10); +;; memset(s, 0, 10); +;; delete[] x; +;; delete[] w; +;; delete[] r; +;; sleep(10); +;; delete[] y; +;; delete[] z; +;; delete[] s; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; Compiled without optimization to prevent inlining and devirtualization. +;; +;; The IR was then reduced using llvm-reduce with the expected FileCheck input. + +; RUN: opt -passes=memprof-context-disambiguation \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ +; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP + +; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT + + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare ptr @_Z3barP1A(ptr) + +define i32 @main(ptr %b, ptr %a) #0 { +entry: + %call = call noundef ptr @_Z3foov(), !callsite !0 + %call1 = call noundef ptr @_Z3foov(), !callsite !1 + %call2 = call noundef ptr @_Z3barP1A(ptr noundef %b), !callsite !2 + %call3 = call noundef ptr @_Z3barP1A(ptr noundef %b), !callsite !3 + %call4 = call noundef ptr @_Z3barP1A(ptr noundef %a), !callsite !4 + %call5 = call noundef ptr @_Z3barP1A(ptr noundef %a), !callsite !5 + ret i32 0 +} + +; Function Attrs: noinline +declare void @_ZN1BC2Ev() #1 + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2 + +; Function Attrs: nobuiltin +declare void @_ZdaPv() #3 + +define internal ptr @_ZN1A1xEv() #4 { +entry: + %call = call noundef ptr @_Z3foov(), !callsite !6 + ret ptr null +} + +; Function Attrs: mustprogress uwtable +define internal ptr @_ZN1B1xEv() #5 { +entry: + %call = call noundef ptr @_Z3foov(), !callsite !7 + ret ptr null +} + +; Function Attrs: mustprogress uwtable +define internal ptr @_Z3foov() #5 { +entry: + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7, !memprof !8, !callsite !21 + ret ptr null +} + +declare ptr @_Znam(i64) #6 + +; uselistorder directives +uselistorder ptr @_Z3foov, { 3, 2, 1, 0 } + +attributes #0 = { "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" } +attributes #1 = { noinline } +attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #3 = { nobuiltin } +attributes #4 = { "tune-cpu"="generic" } +attributes #5 = { mustprogress uwtable "disable-tail-calls"="true" "frame-pointer"="all" 
"min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #6 = { "disable-tail-calls"="true" } +attributes #7 = { builtin } + +!0 = !{i64 8632435727821051414} +!1 = !{i64 -3421689549917153178} +!2 = !{i64 6792096022461663180} +!3 = !{i64 -2709642582978494015} +!4 = !{i64 748269490701775343} +!5 = !{i64 -5747251260480066785} +!6 = !{i64 8256774051149711748} +!7 = !{i64 -4831879094954754638} +!8 = !{!9, !11, !13, !15, !17, !19} +!9 = !{!10, !"notcold"} +!10 = !{i64 2732490490862098848, i64 8256774051149711748, i64 -4820244510750103755, i64 748269490701775343} +!11 = !{!12, !"cold"} +!12 = !{i64 2732490490862098848, i64 8256774051149711748, i64 -4820244510750103755, i64 -5747251260480066785} +!13 = !{!14, !"notcold"} +!14 = !{i64 2732490490862098848, i64 8632435727821051414} +!15 = !{!16, !"cold"} +!16 = !{i64 2732490490862098848, i64 -4831879094954754638, i64 -4820244510750103755, i64 6792096022461663180} +!17 = !{!18, !"notcold"} +!18 = !{i64 2732490490862098848, i64 -4831879094954754638, i64 -4820244510750103755, i64 -2709642582978494015} +!19 = !{!20, !"cold"} +!20 = !{i64 2732490490862098848, i64 -3421689549917153178} +!21 = !{i64 2732490490862098848} + + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[FOO:0x[a-z0-9]+]] +; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 3 4 5 6 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[AX:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 3 +; DUMP: Edge from Callee [[FOO]] to Caller: [[BX:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 4 5 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 6 + +; DUMP: Node [[AX]] +; DUMP: %call = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[AX]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[AX]] to Caller: [[BAR:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 + +;; Bar contains an indirect call, with multiple targets. It's call should be null. 
+; DUMP: Node [[BAR]] +; DUMP: null Call +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 4 5 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[AX]] to Caller: [[BAR]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: Edge from Callee [[BX]] to Caller: [[BAR]] AllocTypes: NotColdCold ContextIds: 4 5 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN3:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN4:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN5:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4 +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN6:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 5 + +; DUMP: Node [[MAIN3]] +; DUMP: %call4 = call noundef ptr @_Z3barP1A(ptr noundef %a) (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN3]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN4]] +; DUMP: %call5 = call noundef ptr @_Z3barP1A(ptr noundef %a) (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN4]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN1]] +; DUMP: %call = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 3 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 3 +; DUMP: CallerEdges: + +; DUMP: Node [[BX]] +; DUMP: %call = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 4 5 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[BX]] AllocTypes: NotColdCold ContextIds: 4 5 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BX]] to Caller: [[BAR]] AllocTypes: NotColdCold ContextIds: 4 5 + +; DUMP: Node [[MAIN5]] +; DUMP: %call2 = call noundef ptr @_Z3barP1A(ptr noundef %b) (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN5]] AllocTypes: Cold ContextIds: 4 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN6]] +; DUMP: %call3 = call noundef ptr @_Z3barP1A(ptr noundef %b) (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 5 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN6]] AllocTypes: NotCold ContextIds: 5 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN2]] +; DUMP: %call1 = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 6 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 6 +; DUMP: CallerEdges: + + +; DOT: digraph "postbuild" { +; DOT: label="postbuild"; +; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2 3 4 5 6",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3foov -\> _Znam}"]; +; DOT: Node[[AX:0x[a-z0-9]+]] [shape=record,tooltip="N[[AX]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 8256774051149711748\n_ZN1A1xEv -\> _Z3foov}"]; +; DOT: Node[[AX]] -> Node[[FOO]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"]; +; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2 4 5",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 13626499562959447861\nnull call (external)}"]; +; DOT: Node[[BAR]] -> Node[[AX]][tooltip="ContextIds: 1 
2",fillcolor="mediumorchid1"]; +; DOT: Node[[BAR]] -> Node[[BX:0x[a-z0-9]+]][tooltip="ContextIds: 4 5",fillcolor="mediumorchid1"]; +; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 748269490701775343\nmain -\> _Z3barP1A}"]; +; DOT: Node[[MAIN1]] -> Node[[BAR]][tooltip="ContextIds: 1",fillcolor="brown1"]; +; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 12699492813229484831\nmain -\> _Z3barP1A}"]; +; DOT: Node[[MAIN2]] -> Node[[BAR]][tooltip="ContextIds: 2",fillcolor="cyan"]; +; DOT: Node[[MAIN3:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN3]] ContextIds: 3",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN3]] -> Node[[FOO]][tooltip="ContextIds: 3",fillcolor="brown1"]; +; DOT: Node[[BX]] [shape=record,tooltip="N[[BX]] ContextIds: 4 5",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 13614864978754796978\n_ZN1B1xEv -\> _Z3foov}"]; +; DOT: Node[[BX]] -> Node[[FOO]][tooltip="ContextIds: 4 5",fillcolor="mediumorchid1"]; +; DOT: Node[[MAIN4:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN4]] ContextIds: 4",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 6792096022461663180\nmain -\> _Z3barP1A}"]; +; DOT: Node[[MAIN4]] -> Node[[BAR]][tooltip="ContextIds: 4",fillcolor="cyan"]; +; DOT: Node[[MAIN5:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN5]] ContextIds: 5",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 15737101490731057601\nmain -\> _Z3barP1A}"]; +; DOT: Node[[MAIN5]] -> Node[[BAR]][tooltip="ContextIds: 5",fillcolor="brown1"]; +; DOT: Node[[MAIN6:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN6]] ContextIds: 6",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN6]] -> Node[[FOO]][tooltip="ContextIds: 6",fillcolor="cyan"]; +; DOT: } diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll b/llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll new file mode 100644 index 0000000000000..59f135ca06627 --- /dev/null +++ b/llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll @@ -0,0 +1,189 @@ +;; Test callsite context graph generation for call graph with two memprof +;; contexts and partial inlining, requiring generation of a new fused node to +;; represent the inlined sequence while matching callsite nodes onto the graph. +;; +;; Original code looks like: +;; +;; char *bar() { +;; return new char[10]; +;; } +;; +;; char *baz() { +;; return bar(); +;; } +;; +;; char *foo() { +;; return baz(); +;; } +;; +;; int main(int argc, char **argv) { +;; char *x = foo(); +;; char *y = foo(); +;; memset(x, 0, 10); +;; memset(y, 0, 10); +;; delete[] x; +;; sleep(10); +;; delete[] y; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; The code below was created by forcing inlining of baz into foo, and +;; bar into baz. Due to the inlining of bar we will initially have two +;; allocation nodes in the graph. This tests that we correctly match +;; foo (with baz inlined) onto the graph nodes first, and generate a new +;; fused node for it. 
We should then not match baz (with bar inlined) as that +;; is not reached by the MIB contexts (since all calls from main will look +;; like main -> foo(+baz) -> bar after the inlining reflected in this IR). +;; +;; The IR was then reduced using llvm-reduce with the expected FileCheck input. + +; RUN: opt -passes=memprof-context-disambiguation \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ +; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP + +; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT + + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define internal ptr @_Z3barv() { +entry: + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7, !memprof !0, !callsite !5 + ret ptr null +} + +; Function Attrs: nobuiltin +declare ptr @_Znam(i64) #0 + +; Function Attrs: mustprogress +define internal ptr @_Z3bazv() #1 { +entry: + %call.i = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7, !memprof !0, !callsite !6 + ret ptr null +} + +; Function Attrs: noinline +define internal ptr @_Z3foov() #2 { +entry: + %call.i = call noundef ptr @_Z3barv(), !callsite !7 + ret ptr null +} + +define i32 @main() #3 { +entry: + %call = call noundef ptr @_Z3foov(), !callsite !8 + %call1 = call noundef ptr @_Z3foov(), !callsite !9 + ret i32 0 +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #4 + +; Function Attrs: nounwind +declare void @_ZdaPv() #5 + +declare i32 @sleep() #6 + +attributes #0 = { nobuiltin } +attributes #1 = { mustprogress } +attributes #2 = { noinline } +attributes #3 = { "tune-cpu"="generic" } +attributes #4 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #5 = { nounwind } +attributes #6 = { "disable-tail-calls"="true" } +attributes #7 = { builtin } + +!0 = !{!1, !3} +!1 = !{!2, !"notcold"} +!2 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!3 = !{!4, !"cold"} +!4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} +!5 = !{i64 9086428284934609951} +!6 = !{i64 9086428284934609951, i64 -5964873800580613432} +!7 = !{i64 -5964873800580613432, i64 2732490490862098848} +!8 = !{i64 8632435727821051414} +!9 = !{i64 -3421689549917153178} + + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[BAR:0x[a-z0-9]+]] +; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[FOO:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 + +;; This is leftover from the MIB on the alloc inlined into baz. It is not +;; matched with any call, since there is no such node in the IR. Due to the +;; null call it will not participate in any context transformations. 
+; DUMP: Node [[FOO2:0x[a-z0-9]+]] +; DUMP: null Call +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 3 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAZ:0x[a-z0-9]+]] to Caller: [[FOO2]] AllocTypes: NotColdCold ContextIds: 3 4 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 3 +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4 + +; DUMP: Node [[MAIN1]] +; DUMP: %call = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 3 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 3 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN2]] +; DUMP: %call1 = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 4 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: + +; DUMP: Node [[BAZ]] +; DUMP: %call.i = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 3 4 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO2]] AllocTypes: NotColdCold ContextIds: 3 4 + +;; This is the node synthesized for the call to bar in foo that was created +;; by inlining baz into foo. +; DUMP: Node [[FOO]] +; DUMP: %call.i = call noundef ptr @_Z3barv() (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[FOO]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 + + +; DOT: digraph "postbuild" { +; DOT: label="postbuild"; +; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"]; +; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 2732490490862098848\nnull call (external)}"]; +; DOT: Node[[FOO]] -> Node[[BAZ:0x[a-z0-9]+]][tooltip="ContextIds: 3 4",fillcolor="mediumorchid1"]; +; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1 3",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN1]] -> Node[[FOO]][tooltip="ContextIds: 3",fillcolor="brown1"]; +; DOT: Node[[MAIN1]] -> Node[[FOO2:0x[a-z0-9]+]][tooltip="ContextIds: 1",fillcolor="brown1"]; +; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2 4",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN2]] -> Node[[FOO]][tooltip="ContextIds: 4",fillcolor="cyan"]; +; DOT: Node[[MAIN2]] -> Node[[FOO2]][tooltip="ContextIds: 2",fillcolor="cyan"]; +; DOT: Node[[BAZ]] [shape=record,tooltip="N[[BAZ]] ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc2\n_Z3bazv -\> _Znam}"]; +; DOT: Node[[FOO2]] [shape=record,tooltip="N[[FOO2]] ContextIds: 1 
2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 0\n_Z3foov -\> _Z3barv}"]; +; DOT: Node[[FOO2]] -> Node[[BAR]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"]; +; DOT: } diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/inlined2.ll b/llvm/test/Transforms/MemProfContextDisambiguation/inlined2.ll new file mode 100644 index 0000000000000..a3a056ade8c49 --- /dev/null +++ b/llvm/test/Transforms/MemProfContextDisambiguation/inlined2.ll @@ -0,0 +1,135 @@ +;; Test callsite context graph generation for call graph with two memprof +;; contexts and multiple levels of inlining, requiring generation of new +;; fused nodes to represent the inlined sequence while matching callsite +;; nodes onto the graph. In particular this tests the case where a function +;; has inlined a callee containing an inlined callee. +;; +;; Original code looks like: +;; +;; char *bar() __attribute__((noinline)) { +;; return new char[10]; +;; } +;; +;; char *baz() { +;; return bar(); +;; } +;; +;; char *foo() { +;; return baz(); +;; } +;; +;; int main(int argc, char **argv) { +;; char *x = foo(); +;; char *y = foo(); +;; memset(x, 0, 10); +;; memset(y, 0, 10); +;; delete[] x; +;; sleep(10); +;; delete[] y; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; Both foo and baz are inlined into main, at both foo callsites. +;; We should update the graph for new fused nodes for both of those inlined +;; callsites to bar. +;; +;; Note that baz and bar are both dead due to the inlining, but have been left +;; in the input IR to ensure that the MIB call chain is matched to the longer +;; inline sequences from main. +;; +;; The IR was then reduced using llvm-reduce with the expected FileCheck input. 
+ +; RUN: opt -passes=memprof-context-disambiguation \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP + + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define ptr @_Z3barv() #0 { +entry: + %call = call noalias noundef nonnull dereferenceable(10) ptr @_Znam(i64 noundef 10) #7, !memprof !7, !callsite !12, !heapallocsite !13 + ret ptr null +} + +; Function Attrs: nobuiltin +declare ptr @_Znam(i64) #1 + +; Function Attrs: mustprogress +declare ptr @_Z3bazv() #2 + +define i32 @main() #3 { +delete.end5: + %call.i.i = call noundef ptr @_Z3barv(), !callsite !14 + %call.i.i8 = call noundef ptr @_Z3barv(), !callsite !15 + ret i32 0 +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #4 + +declare void @_ZdaPv() #5 + +declare i32 @sleep() #6 + +attributes #0 = { "stack-protector-buffer-size"="8" } +attributes #1 = { nobuiltin } +attributes #2 = { mustprogress } +attributes #3 = { "tune-cpu"="generic" } +attributes #4 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #5 = { "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" } +attributes #6 = { "disable-tail-calls"="true" } +attributes #7 = { builtin } + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6} + +!0 = !{i32 7, !"Dwarf Version", i32 5} +!1 = !{i32 2, !"Debug Info Version", i32 3} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 8, !"PIC Level", i32 2} +!4 = !{i32 7, !"PIE Level", i32 2} +!5 = !{i32 7, !"uwtable", i32 2} +!6 = !{i32 7, !"frame-pointer", i32 2} +!7 = !{!8, !10} +!8 = !{!9, !"notcold"} +!9 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!10 = !{!11, !"cold"} +!11 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} +!12 = !{i64 9086428284934609951} +!13 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char) +!14 = !{i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!15 = !{i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} + + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[BAR:0x[a-z0-9]+]] +; DUMP: %call = call noalias noundef nonnull dereferenceable(10) ptr @_Znam(i64 noundef 10) #7, !heapallocsite !7 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 + +;; This is the node synthesized for the first inlined call chain of main->foo->baz +; DUMP: Node [[MAIN1]] +; DUMP: %call.i.i = call noundef ptr @_Z3barv() (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +;; This is the node synthesized for the second inlined call chain of main->foo->baz +; DUMP: Node [[MAIN2]] +; DUMP: %call.i.i8 = call noundef ptr @_Z3barv() (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: diff --git 
a/llvm/test/Transforms/MemProfContextDisambiguation/pass-pipeline.ll b/llvm/test/Transforms/MemProfContextDisambiguation/pass-pipeline.ll new file mode 100644 index 0000000000000..fede5fe96eccd --- /dev/null +++ b/llvm/test/Transforms/MemProfContextDisambiguation/pass-pipeline.ll @@ -0,0 +1,41 @@ +;; Test that MemProfContextDisambiguation is enabled under the expected conditions +;; and in the expected position. + +;; Pass is not currently enabled by default at any opt level. +; RUN: opt -debug-pass-manager -passes='lto<O0>' -S %s \ +; RUN: 2>&1 | FileCheck %s --implicit-check-not="Running pass: MemProfContextDisambiguation" +; RUN: opt -debug-pass-manager -passes='lto<O1>' -S %s \ +; RUN: 2>&1 | FileCheck %s --implicit-check-not="Running pass: MemProfContextDisambiguation" +; RUN: opt -debug-pass-manager -passes='lto<O2>' -S %s \ +; RUN: 2>&1 | FileCheck %s --implicit-check-not="Running pass: MemProfContextDisambiguation" +; RUN: opt -debug-pass-manager -passes='lto<O3>' -S %s \ +; RUN: 2>&1 | FileCheck %s --implicit-check-not="Running pass: MemProfContextDisambiguation" + +;; Pass should not run even under option at O0/O1. +; RUN: opt -debug-pass-manager -passes='lto<O0>' -S %s \ +; RUN: -enable-memprof-context-disambiguation \ +; RUN: 2>&1 | FileCheck %s --implicit-check-not="Running pass: MemProfContextDisambiguation" +; RUN: opt -debug-pass-manager -passes='lto<O1>' -S %s \ +; RUN: -enable-memprof-context-disambiguation \ +; RUN: 2>&1 | FileCheck %s --implicit-check-not="Running pass: MemProfContextDisambiguation" + +;; Pass should be enabled under option at O2/O3. +; RUN: opt -debug-pass-manager -passes='lto<O2>' -S %s \ +; RUN: -enable-memprof-context-disambiguation \ +; RUN: 2>&1 | FileCheck %s --check-prefix=ENABLED +; RUN: opt -debug-pass-manager -passes='lto<O3>' -S %s \ +; RUN: -enable-memprof-context-disambiguation \ +; RUN: 2>&1 | FileCheck %s --check-prefix=ENABLED + +;; When enabled, MemProfContextDisambiguation runs just after inlining. +; ENABLED: Running pass: InlinerPass +; ENABLED: Invalidating analysis: InlineAdvisorAnalysis +; ENABLED: Running pass: MemProfContextDisambiguation + +define noundef ptr @_Z3barv() { +entry: + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) + ret ptr %call +} + +declare noundef nonnull ptr @_Znam(i64 noundef) From 96449fa5b639449b720a9d84da5d8c29084b4328 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 22 Mar 2023 17:21:25 +0000 Subject: [PATCH 019/208] [gn build] Port 700cd99061ed --- llvm/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn index 644d30f10854e..0dbeb793e40eb 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn @@ -48,6 +48,7 @@ static_library("IPO") { "Internalize.cpp", "LoopExtractor.cpp", "LowerTypeTests.cpp", + "MemProfContextDisambiguation.cpp", "MergeFunctions.cpp", "ModuleInliner.cpp", "OpenMPOpt.cpp", From 77044a47b4dec308e02c796e7951ab1745a7f53c Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Wed, 22 Mar 2023 04:57:08 +0000 Subject: [PATCH 020/208] [CMake] Build runtimes for riscv64-unknown-fuchsia This is necessary to have a complete RISC-V toolchain for Fuchsia.
Differential Revision: https://reviews.llvm.org/D146608 --- clang/cmake/caches/Fuchsia-stage2.cmake | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake index c874d8cacd197..037cb67e82189 100644 --- a/clang/cmake/caches/Fuchsia-stage2.cmake +++ b/clang/cmake/caches/Fuchsia-stage2.cmake @@ -204,7 +204,7 @@ if(FUCHSIA_SDK) set(BUILTINS_${target}_CMAKE_SYSROOT ${FUCHSIA_${target}_SYSROOT} CACHE PATH "") endforeach() - foreach(target x86_64-unknown-fuchsia;aarch64-unknown-fuchsia) + foreach(target x86_64-unknown-fuchsia;aarch64-unknown-fuchsia;riscv64-unknown-fuchsia) # Set the per-target runtimes options. list(APPEND RUNTIME_TARGETS "${target}") set(RUNTIMES_${target}_CMAKE_SYSTEM_NAME Fuchsia CACHE STRING "") @@ -276,12 +276,12 @@ if(FUCHSIA_SDK) set(LLVM_RUNTIME_MULTILIBS "asan;noexcept;compat;asan+noexcept;hwasan;hwasan+noexcept" CACHE STRING "") - set(LLVM_RUNTIME_MULTILIB_asan_TARGETS "x86_64-unknown-fuchsia;aarch64-unknown-fuchsia" CACHE STRING "") - set(LLVM_RUNTIME_MULTILIB_noexcept_TARGETS "x86_64-unknown-fuchsia;aarch64-unknown-fuchsia" CACHE STRING "") - set(LLVM_RUNTIME_MULTILIB_compat_TARGETS "x86_64-unknown-fuchsia;aarch64-unknown-fuchsia" CACHE STRING "") - set(LLVM_RUNTIME_MULTILIB_asan+noexcept_TARGETS "x86_64-unknown-fuchsia;aarch64-unknown-fuchsia" CACHE STRING "") - set(LLVM_RUNTIME_MULTILIB_hwasan_TARGETS "aarch64-unknown-fuchsia" CACHE STRING "") - set(LLVM_RUNTIME_MULTILIB_hwasan+noexcept_TARGETS "aarch64-unknown-fuchsia" CACHE STRING "") + set(LLVM_RUNTIME_MULTILIB_asan_TARGETS "x86_64-unknown-fuchsia;aarch64-unknown-fuchsia;riscv64-unknown-fuchsia" CACHE STRING "") + set(LLVM_RUNTIME_MULTILIB_noexcept_TARGETS "x86_64-unknown-fuchsia;aarch64-unknown-fuchsia;riscv64-unknown-fuchsia" CACHE STRING "") + set(LLVM_RUNTIME_MULTILIB_compat_TARGETS "x86_64-unknown-fuchsia;aarch64-unknown-fuchsia;riscv64-unknown-fuchsia" CACHE STRING "") + set(LLVM_RUNTIME_MULTILIB_asan+noexcept_TARGETS "x86_64-unknown-fuchsia;aarch64-unknown-fuchsia;riscv64-unknown-fuchsia" CACHE STRING "") + set(LLVM_RUNTIME_MULTILIB_hwasan_TARGETS "aarch64-unknown-fuchsia;riscv64-unknown-fuchsia" CACHE STRING "") + set(LLVM_RUNTIME_MULTILIB_hwasan+noexcept_TARGETS "aarch64-unknown-fuchsia;riscv64-unknown-fuchsia" CACHE STRING "") endif() set(LLVM_BUILTIN_TARGETS "${BUILTIN_TARGETS}" CACHE STRING "") From 84de01908b58f3aa25cc3dc700a8a1b01b5263f0 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 22 Mar 2023 10:24:57 -0700 Subject: [PATCH 021/208] [RISCV] Remove AnyReg RegisterClass used by .insn instructions. Use custom operand instead. The fake register class interferes too much with the autogenerated register class tables. Especially the fake spill size. I'm working on .insn support for compressed instructions and adding AnyRegC broke CodeGen. 
--- llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp | 6 ++++++ llvm/lib/Target/RISCV/RISCVInstrInfo.td | 11 +++++++++++ llvm/lib/Target/RISCV/RISCVRegisterInfo.td | 12 ------------ 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 9c6d54e62b16c..d984f39321a6e 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -368,6 +368,12 @@ struct RISCVOperand final : public MCParsedAsmOperand { bool isV0Reg() const { return Kind == KindTy::Register && Reg.RegNum == RISCV::V0; } + bool isAnyReg() const { + return Kind == KindTy::Register && + (RISCVMCRegisterClasses[RISCV::GPRRegClassID].contains(Reg.RegNum) || + RISCVMCRegisterClasses[RISCV::FPR64RegClassID].contains(Reg.RegNum) || + RISCVMCRegisterClasses[RISCV::VRRegClassID].contains(Reg.RegNum)); + } bool isImm() const override { return Kind == KindTy::Immediate; } bool isMem() const override { return false; } bool isSystemRegister() const { return Kind == KindTy::SystemRegister; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index ab8a8a4cc9935..85c3082dce64f 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -1090,6 +1090,17 @@ def : InstAlias<"zext.b $rd, $rs", (ANDI GPR:$rd, GPR:$rs, 0xFF), 0>; // .insn directive instructions //===----------------------------------------------------------------------===// +def AnyRegOperand : AsmOperandClass { + let Name = "AnyRegOperand"; + let RenderMethod = "addRegOperands"; + let PredicateMethod = "isAnyReg"; +} + +def AnyReg : Operand { + let OperandType = "OPERAND_REGISTER"; + let ParserMatchClass = AnyRegOperand; +} + // isCodeGenOnly = 1 to hide them from the tablegened assembly parser. let isCodeGenOnly = 1, hasSideEffects = 1, mayLoad = 1, mayStore = 1, hasNoSchedulingInfo = 1 in { diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index 7e91441e91f47..d06453c82739e 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -578,15 +578,3 @@ foreach m = LMULList.m in { // Special registers def FFLAGS : RISCVReg<0, "fflags">; def FRM : RISCVReg<0, "frm">; - -// Any type register. Used for .insn directives when we don't know what the -// register types could be. -// NOTE: The alignment and size are bogus values. The Size needs to be non-zero -// or tablegen will use "untyped" to determine the size which will assert. -let isAllocatable = 0 in -def AnyReg : RegisterClass<"RISCV", [untyped], 32, - (add (sequence "X%u", 0, 31), - (sequence "F%u_D", 0, 31), - (sequence "V%u", 0, 31))> { - let Size = 32; -} From c4125a37806aa2f663018f4f8dc5bbd5159c51c1 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Thu, 2 Mar 2023 12:04:27 -0800 Subject: [PATCH 022/208] Revert "Remove the LINK_COMPONENTS entry from lldb-instr CMakery" This reverts commit e12a950d90f88aeddaa97d6e7c8fd0bfedc42f73. D142241 broke `-sBUILD_SHARED_LIBS=ON` build. After investigations in https://github.com/llvm/llvm-project/issues/60314, the issue that prompted D142441 now seems gone. Fixes https://github.com/llvm/llvm-project/issues/60314. 
Reviewed By: JDevlieghere Differential Revision: https://reviews.llvm.org/D145181 --- lldb/tools/lldb-instr/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lldb/tools/lldb-instr/CMakeLists.txt b/lldb/tools/lldb-instr/CMakeLists.txt index a1bbd7e2b7c93..8da453b2894fd 100644 --- a/lldb/tools/lldb-instr/CMakeLists.txt +++ b/lldb/tools/lldb-instr/CMakeLists.txt @@ -11,4 +11,6 @@ add_lldb_tool(lldb-instr clangSerialization clangTooling + LINK_COMPONENTS + Support ) From 24e3102edb4a48b8754efeaecad3e474a626f35c Mon Sep 17 00:00:00 2001 From: Julian Lettner Date: Tue, 21 Mar 2023 16:20:22 -0700 Subject: [PATCH 023/208] [TSan] Avoid deadlock between ReportRace() and dlopen() interceptor This change prevents rare deadlocks observed for specific macOS/iOS GUI applications which issue many `dlopen()` calls from multiple different threads at startup and where TSan finds and reports a race during startup. Providing a reliable test for this has been deemed infeasible. Although I've only observed this deadlock on Apple platforms, conceptually the cause is not confined to Apple code so the fix lives in platform-independent code. Deadlock scenario: ``` Thread 2 | Thread 4 ReportRace() | Lock internal TSan mutexes | &ctx->slot_mtx | | dlopen() interceptor | OnLibraryLoaded() | MemoryMappingLayout::DumpListOfModules() | calls dyld API, which takes internal lock | lock() interceptor | TSan tries to take internal mutexes again | &ctx->slot_mtx call into symbolizer | MemoryMappingLayout::DumpListOfModules() calls dyld API, which hangs on trying to take lock ``` Resulting in: * Thread 2 has internal TSan mutex, blocked on dyld lock * Thread 4 has dyld lock, blocked on internal TSan mutex The fix prevents this situation by not intercepting any of the calls originating from `MemoryMappingLayout::DumpListOfModules()`. 
Stack traces for deadlock between ReportRace() and dlopen() interceptor: ``` thread #2, queue = 'com.apple.root.default-qos' frame #0: libsystem_kernel.dylib frame #1: libclang_rt.tsan_osx_dynamic.dylib`::wrap_os_unfair_lock_lock_with_options(lock=, options=) at tsan_interceptors_mac.cpp:306:3 frame #2: dyld`dyld4::RuntimeLocks::withLoadersReadLock(this=0x000000016f21b1e0, work=0x00000001814523c0) block_pointer) at DyldRuntimeState.cpp:227:28 frame #3: dyld`dyld4::APIs::_dyld_get_image_header(this=0x0000000101012a20, imageIndex=614) at DyldAPIs.cpp:240:11 frame #4: libclang_rt.tsan_osx_dynamic.dylib`__sanitizer::MemoryMappingLayout::CurrentImageHeader(this=) at sanitizer_procmaps_mac.cpp:391:35 frame #5: libclang_rt.tsan_osx_dynamic.dylib`__sanitizer::MemoryMappingLayout::Next(this=0x000000016f2a2800, segment=0x000000016f2a2738) at sanitizer_procmaps_mac.cpp:397:51 frame #6: libclang_rt.tsan_osx_dynamic.dylib`__sanitizer::MemoryMappingLayout::DumpListOfModules(this=0x000000016f2a2800, modules=0x00000001011000a0) at sanitizer_procmaps_mac.cpp:460:10 frame #7: libclang_rt.tsan_osx_dynamic.dylib`__sanitizer::ListOfModules::init(this=0x00000001011000a0) at sanitizer_mac.cpp:610:18 frame #8: libclang_rt.tsan_osx_dynamic.dylib`__sanitizer::Symbolizer::FindModuleForAddress(unsigned long) [inlined] __sanitizer::Symbolizer::RefreshModules(this=0x0000000101100078) at sanitizer_symbolizer_libcdep.cpp:185:12 frame #9: libclang_rt.tsan_osx_dynamic.dylib`__sanitizer::Symbolizer::FindModuleForAddress(this=0x0000000101100078, address=6465454512) at sanitizer_symbolizer_libcdep.cpp:204:5 frame #10: libclang_rt.tsan_osx_dynamic.dylib`__sanitizer::Symbolizer::SymbolizePC(this=0x0000000101100078, addr=6465454512) at sanitizer_symbolizer_libcdep.cpp:88:15 frame #11: libclang_rt.tsan_osx_dynamic.dylib`__tsan::SymbolizeCode(addr=6465454512) at tsan_symbolize.cpp:106:35 frame #12: libclang_rt.tsan_osx_dynamic.dylib`__tsan::SymbolizeStack(trace=StackTrace @ 0x0000600002d66d00) at tsan_rtl_report.cpp:112:28 frame #13: libclang_rt.tsan_osx_dynamic.dylib`__tsan::ScopedReportBase::AddMemoryAccess(this=0x000000016f2a2a90, addr=4381057136, external_tag=, s=, tid=, stack=, mset=0x00000001012fc310) at tsan_rtl_report.cpp:190:16 frame #14: libclang_rt.tsan_osx_dynamic.dylib`__tsan::ReportRace(thr=0x00000001012fc000, shadow_mem=0x000008020a4340e0, cur=, old=, typ0=1) at tsan_rtl_report.cpp:795:9 frame #15: libclang_rt.tsan_osx_dynamic.dylib`__tsan::DoReportRace(thr=0x00000001012fc000, shadow_mem=0x000008020a4340e0, cur=Shadow @ x22, old=Shadow @ 0x0000600002d6b4f0, typ=1) at tsan_rtl_access.cpp:166:3 frame #16: libclang_rt.tsan_osx_dynamic.dylib`::__tsan_read8(void *) at tsan_rtl_access.cpp:220:5 frame #17: libclang_rt.tsan_osx_dynamic.dylib`::__tsan_read8(void *) [inlined] __tsan::MemoryAccess(thr=0x00000001012fc000, pc=, addr=, size=8, typ=1) at tsan_rtl_access.cpp:442:3 frame #18: libclang_rt.tsan_osx_dynamic.dylib`::__tsan_read8(addr=) at tsan_interface.inc:34:3 thread #4, queue = 'com.apple.dock.fullscreen' frame #0: libsystem_kernel.dylib frame #1: libclang_rt.tsan_osx_dynamic.dylib`__sanitizer::FutexWait(p=, cmp=) at sanitizer_mac.cpp:540:3 frame #2: libclang_rt.tsan_osx_dynamic.dylib`__sanitizer::Semaphore::Wait(this=) at sanitizer_mutex.cpp:35:7 frame #3: libclang_rt.tsan_osx_dynamic.dylib`__sanitizer::Mutex::Lock(this=0x0000000102992a80) at sanitizer_mutex.h:196:18 frame #4: libclang_rt.tsan_osx_dynamic.dylib`__tsan::ScopedInterceptor::~ScopedInterceptor() [inlined] 
__sanitizer::GenericScopedLock<__sanitizer::Mutex>::GenericScopedLock(this=, mu=0x0000000102992a80) at sanitizer_mutex.h:383:10 frame #5: libclang_rt.tsan_osx_dynamic.dylib`__tsan::ScopedInterceptor::~ScopedInterceptor() [inlined] __sanitizer::GenericScopedLock<__sanitizer::Mutex>::GenericScopedLock(this=, mu=0x0000000102992a80) at sanitizer_mutex.h:382:77 frame #6: libclang_rt.tsan_osx_dynamic.dylib`__tsan::ScopedInterceptor::~ScopedInterceptor() at tsan_rtl.h:708:10 frame #7: libclang_rt.tsan_osx_dynamic.dylib`__tsan::ScopedInterceptor::~ScopedInterceptor() [inlined] __tsan::TryTraceFunc(thr=0x000000010f084000, pc=0) at tsan_rtl.h:751:7 frame #8: libclang_rt.tsan_osx_dynamic.dylib`__tsan::ScopedInterceptor::~ScopedInterceptor() [inlined] __tsan::FuncExit(thr=0x000000010f084000) at tsan_rtl.h:798:7 frame #9: libclang_rt.tsan_osx_dynamic.dylib`__tsan::ScopedInterceptor::~ScopedInterceptor(this=0x000000016f3ba280) at tsan_interceptors_posix.cpp:300:5 frame #10: libclang_rt.tsan_osx_dynamic.dylib`__tsan::ScopedInterceptor::~ScopedInterceptor(this=) at tsan_interceptors_posix.cpp:293:41 frame #11: libclang_rt.tsan_osx_dynamic.dylib`::wrap_os_unfair_lock_lock_with_options(lock=0x000000016f21b1e8, options=OS_UNFAIR_LOCK_NONE) at tsan_interceptors_mac.cpp:310:1 frame #12: dyld`dyld4::RuntimeLocks::withLoadersReadLock(this=0x000000016f21b1e0, work=0x00000001814525d4) block_pointer) at DyldRuntimeState.cpp:227:28 frame #13: dyld`dyld4::APIs::_dyld_get_image_vmaddr_slide(this=0x0000000101012a20, imageIndex=412) at DyldAPIs.cpp:273:11 frame #14: libclang_rt.tsan_osx_dynamic.dylib`__sanitizer::MemoryMappingLayout::Next(__sanitizer::MemoryMappedSegment*) at sanitizer_procmaps_mac.cpp:286:17 frame #15: libclang_rt.tsan_osx_dynamic.dylib`__sanitizer::MemoryMappingLayout::Next(this=0x000000016f3ba560, segment=0x000000016f3ba498) at sanitizer_procmaps_mac.cpp:432:15 frame #16: libclang_rt.tsan_osx_dynamic.dylib`__sanitizer::MemoryMappingLayout::DumpListOfModules(this=0x000000016f3ba560, modules=0x000000016f3ba618) at sanitizer_procmaps_mac.cpp:460:10 frame #17: libclang_rt.tsan_osx_dynamic.dylib`__sanitizer::ListOfModules::init(this=0x000000016f3ba618) at sanitizer_mac.cpp:610:18 frame #18: libclang_rt.tsan_osx_dynamic.dylib`__sanitizer::LibIgnore::OnLibraryLoaded(this=0x0000000101f3aa40, name="") at sanitizer_libignore.cpp:54:11 frame #19: libclang_rt.tsan_osx_dynamic.dylib`::wrap_dlopen(filename="", flag=) at sanitizer_common_interceptors.inc:6466:3 ``` rdar://106766395 Differential Revision: https://reviews.llvm.org/D146593 --- .../lib/tsan/rtl/tsan_interceptors_posix.cpp | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp b/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp index 97aa4b77311f1..6ac6ac6a7fb4c 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp @@ -2497,11 +2497,21 @@ static void HandleRecvmsg(ThreadState *thr, uptr pc, res; \ }) +// Ignore interceptors in OnLibraryLoaded()/Unloaded(). These hooks use code +// (ListOfModules::init, MemoryMappingLayout::DumpListOfModules) that make +// intercepted calls, which can cause deadlockes with ReportRace() which also +// uses this code. 
#define COMMON_INTERCEPTOR_LIBRARY_LOADED(filename, handle) \ - libignore()->OnLibraryLoaded(filename) + ({ \ + ScopedIgnoreInterceptors ignore_interceptors; \ + libignore()->OnLibraryLoaded(filename); \ + }) -#define COMMON_INTERCEPTOR_LIBRARY_UNLOADED() \ - libignore()->OnLibraryUnloaded() +#define COMMON_INTERCEPTOR_LIBRARY_UNLOADED() \ + ({ \ + ScopedIgnoreInterceptors ignore_interceptors; \ + libignore()->OnLibraryUnloaded(); \ + }) #define COMMON_INTERCEPTOR_ACQUIRE(ctx, u) \ Acquire(((TsanInterceptorContext *) ctx)->thr, pc, u) From 16b7cf245ec0ff5428daee4f71af62e1938bfc73 Mon Sep 17 00:00:00 2001 From: Saleem Abdulrasool Date: Tue, 21 Mar 2023 12:03:54 -0400 Subject: [PATCH 024/208] SymbolFile: ensure that we have a value before invoking `getBitWidth` Ensure that the variant returned by `member->getValue()` has a value and is not `Empty`. Failure to do so will trigger an assertion failure in `llvm::pdb::Variant::getBitWidth()`. This can occur when the `static` member is a forward declaration. Differential Revision: https://reviews.llvm.org/D146536 Reviewed By: sgraenitz --- lldb/source/Plugins/SymbolFile/PDB/PDBASTParser.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lldb/source/Plugins/SymbolFile/PDB/PDBASTParser.cpp b/lldb/source/Plugins/SymbolFile/PDB/PDBASTParser.cpp index da57338ffb58a..b1a882465c404 100644 --- a/lldb/source/Plugins/SymbolFile/PDB/PDBASTParser.cpp +++ b/lldb/source/Plugins/SymbolFile/PDB/PDBASTParser.cpp @@ -1299,6 +1299,15 @@ void PDBASTParser::AddRecordMembers( // Query the symbol's value as the variable initializer if valid. if (member_comp_type.IsConst()) { auto value = member->getValue(); + if (value.Type == llvm::pdb::Empty) { + LLDB_LOG(GetLog(LLDBLog::AST), + "Class '{0}' has member '{1}' of type '{2}' with an unknown " + "constant size.", + record_type.GetTypeName(), member_name, + member_comp_type.GetTypeName()); + continue; + } + clang::QualType qual_type = decl->getType(); unsigned type_width = m_ast.getASTContext().getIntWidth(qual_type); unsigned constant_width = value.getBitWidth(); From ead9644684e85e0611f3b0ff72926820f1277e09 Mon Sep 17 00:00:00 2001 From: Emilia Dreamer Date: Wed, 22 Mar 2023 20:26:38 +0200 Subject: [PATCH 025/208] [clang-format] Annotate noexcept, explicit specifiers as containing expressions The noexcept specifier and explicit specifier can optionally include a boolean expression to make these specifiers apply conditionally, however, clang-format didn't set the context for the parenthesized content of these specifiers, meaning they inherited the parent context, which usually isn't an expressions, leading to misannotated binary operators. This patch applies expression context to the content of these specifiers, making them similar to the static_assert keyword. 
Fixes https://github.com/llvm/llvm-project/issues/44543 Reviewed By: owenpan, MyDeveloperDay Differential Revision: https://reviews.llvm.org/D146284 --- clang/lib/Format/TokenAnnotator.cpp | 7 ++++--- clang/unittests/Format/FormatTest.cpp | 4 ++++ clang/unittests/Format/TokenAnnotatorTest.cpp | 11 +++++++++++ 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index c5644c5bfea16..55be50aec203e 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -318,9 +318,10 @@ class AnnotatingParser { // export type X = (...); Contexts.back().IsExpression = false; } else if (OpeningParen.Previous && - (OpeningParen.Previous->isOneOf(tok::kw_static_assert, - tok::kw_while, tok::l_paren, - tok::comma, TT_BinaryOperator) || + (OpeningParen.Previous->isOneOf( + tok::kw_static_assert, tok::kw_noexcept, tok::kw_explicit, + tok::kw_while, tok::l_paren, tok::comma, + TT_BinaryOperator) || OpeningParen.Previous->isIf())) { // static_assert, if and while usually contain expressions. Contexts.back().IsExpression = true; diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 1beb6a75c5225..eeb1234999a10 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -11592,6 +11592,10 @@ TEST_F(FormatTest, UnderstandsRvalueReferences) { verifyFormat("template <bool B, bool C> class A {\n" " static_assert(B && C, \"Something is wrong\");\n" "};"); + verifyFormat("template <typename T> void swap() noexcept(Bar<T> && Foo<T>);"); + verifyFormat("template <typename T> struct S {\n" + " explicit(Bar<T> && Foo<T>) S(const S &);\n" + "};"); verifyGoogleFormat("#define IF(a, b, c) if (a && (b == c))"); verifyGoogleFormat("#define WHILE(a, b, c) while (a && (b == c))"); verifyFormat("#define A(a, b) (a && b)"); diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index bc8f7f36372d2..3a6fb0e9e4b3f 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -242,6 +242,17 @@ TEST_F(TokenAnnotatorTest, UnderstandsUsesOfStarAndAmp) { "}"); ASSERT_EQ(Tokens.size(), 12u) << Tokens; EXPECT_TOKEN(Tokens[7], tok::amp, TT_BinaryOperator); + + Tokens = + annotate("template <typename T> void swap() noexcept(Bar<T> && Foo<T>);"); + ASSERT_EQ(Tokens.size(), 23u) << Tokens; + EXPECT_TOKEN(Tokens[15], tok::ampamp, TT_BinaryOperator); + + Tokens = annotate("template <typename T> struct S {\n" + " explicit(Bar<T> && Foo<T>) S(const S &);\n" + "};"); + ASSERT_EQ(Tokens.size(), 30u) << Tokens; + EXPECT_TOKEN(Tokens[14], tok::ampamp, TT_BinaryOperator); } TEST_F(TokenAnnotatorTest, UnderstandsUsesOfPlusAndMinus) { From 984354fbbe4e207798f6d83c6f46b7603952dd36 Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Tue, 21 Mar 2023 15:30:32 -0700 Subject: [PATCH 026/208] [lldb] Update some uses of Python2 API in typemaps. Python 3 doesn't have a distinction between PyInt and PyLong, it's all PyLong now. This also fixes a bug in SetNumberFromObject. This used to crash LLDB: ``` lldb -o "script data=lldb.SBData(); data.SetDataFromUInt64Array([2**63])" ``` The problem happened in the PyInt path: ``` if (PyInt_Check(obj)) number = static_cast<T>(PyInt_AsLong(obj)); ``` when obj doesn't fit in a signed long, `PyInt_AsLong` would fail with "OverflowError: Python int too large to convert to C long". The existing long path does the right thing, as it will call `PyLong_AsUnsignedLongLong` for uint64_t.
Differential Revision: https://reviews.llvm.org/D146590 --- lldb/bindings/python/python-typemaps.swig | 18 +++++++----------- lldb/test/API/python_api/sbdata/TestSBData.py | 3 ++- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/lldb/bindings/python/python-typemaps.swig b/lldb/bindings/python/python-typemaps.swig index b3e7dd4ee265a..3e9675c8c00f1 100644 --- a/lldb/bindings/python/python-typemaps.swig +++ b/lldb/bindings/python/python-typemaps.swig @@ -103,11 +103,11 @@ // typemap for a char buffer %typemap(in) (char *dst, size_t dst_len) { - if (!PyInt_Check($input)) { + if (!PyLong_Check($input)) { PyErr_SetString(PyExc_ValueError, "Expecting an integer"); SWIG_fail; } - $2 = PyInt_AsLong($input); + $2 = PyLong_AsLong($input); if ($2 <= 0) { PyErr_SetString(PyExc_ValueError, "Positive integer expected"); SWIG_fail; @@ -139,11 +139,11 @@ // typemap for handling an snprintf-like API like SBThread::GetStopDescription. %typemap(in) (char *dst_or_null, size_t dst_len) { - if (!PyInt_Check($input)) { + if (!PyLong_Check($input)) { PyErr_SetString(PyExc_ValueError, "Expecting an integer"); SWIG_fail; } - $2 = PyInt_AsLong($input); + $2 = PyLong_AsLong($input); if ($2 <= 0) { PyErr_SetString(PyExc_ValueError, "Positive integer expected"); SWIG_fail; @@ -205,9 +205,7 @@ // typemap for an incoming buffer // See also SBProcess::ReadMemory. %typemap(in) (void *buf, size_t size) { - if (PyInt_Check($input)) { - $2 = PyInt_AsLong($input); - } else if (PyLong_Check($input)) { + if (PyLong_Check($input)) { $2 = PyLong_AsLong($input); } else { PyErr_SetString(PyExc_ValueError, "Expecting an integer or long object"); @@ -258,9 +256,7 @@ template <> int32_t PyLongAsT(PyObject *obj) { } template bool SetNumberFromPyObject(T &number, PyObject *obj) { - if (PyInt_Check(obj)) - number = static_cast(PyInt_AsLong(obj)); - else if (PyLong_Check(obj)) + if (PyLong_Check(obj)) number = PyLongAsT(obj); else return false; @@ -345,7 +341,7 @@ template <> bool SetNumberFromPyObject(double &number, PyObject *obj) { count = $2; PyObject *list = PyList_New(count); for (uint32_t j = 0; j < count; j++) { - PyObject *item = PyInt_FromLong($1[j]); + PyObject *item = PyLong_FromLong($1[j]); int ok = PyList_SetItem(list, j, item); if (ok != 0) { $result = Py_None; diff --git a/lldb/test/API/python_api/sbdata/TestSBData.py b/lldb/test/API/python_api/sbdata/TestSBData.py index 932781b9b1b0f..ba839590c1a36 100644 --- a/lldb/test/API/python_api/sbdata/TestSBData.py +++ b/lldb/test/API/python_api/sbdata/TestSBData.py @@ -387,12 +387,13 @@ def test_with_run_command(self): self.assert_data(data2.GetUnsignedInt8, 4, 111) self.assert_data(data2.GetUnsignedInt8, 5, 33) - data2.SetDataFromUInt64Array([1, 2, 3, 4, 5]) + data2.SetDataFromUInt64Array([1, 2, 3, 4, 5, 2**63]) self.assert_data(data2.GetUnsignedInt64, 0, 1) self.assert_data(data2.GetUnsignedInt64, 8, 2) self.assert_data(data2.GetUnsignedInt64, 16, 3) self.assert_data(data2.GetUnsignedInt64, 24, 4) self.assert_data(data2.GetUnsignedInt64, 32, 5) + self.assert_data(data2.GetUnsignedInt64, 40, 2**63) self.assertEqual( data2.uint64[0], 1, From 38d69df5c2dad0d4ceb08d840840ab083dd673fe Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 22 Mar 2023 11:36:02 -0700 Subject: [PATCH 027/208] [Driver][test] Fix avr-ld.c for -DCLANG_DEFAULT_LINKER=lld after D145646 --- clang/test/Driver/avr-ld.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/clang/test/Driver/avr-ld.c b/clang/test/Driver/avr-ld.c index 3088bc00446f4..4042ecb89adf5 100644 --- 
a/clang/test/Driver/avr-ld.c +++ b/clang/test/Driver/avr-ld.c @@ -44,16 +44,16 @@ // RUN: %clang -### --target=avr -mmcu=atxmega128a1 --rtlib=libgcc --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKO %s // LINKO: {{".*ld.*"}} {{.*}} {{"-L.*avrxmega7"}} {{.*}} "--defsym=__DATA_REGION_ORIGIN__=0x802000" "--start-group" {{.*}} "-latxmega128a1" {{.*}} "--end-group" "--relax" "-mavrxmega7" -// RUN: %clang -### --target=avr -mmcu=atmega328 -flto --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKP %s +// RUN: %clang -### --target=avr -mmcu=atmega328 -fuse-ld=ld -flto --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck --check-prefix=LINKP %s // LINKP: {{".*ld.*"}} {{.*}} "--defsym=__DATA_REGION_ORIGIN__=0x800100" "-plugin" {{.*}} "-plugin-opt=mcpu=atmega328" -// RUN: %clang -### --target=avr -flto --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKQ %s +// RUN: %clang -### --target=avr -fuse-ld=ld -flto --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck --check-prefix=LINKQ %s // LINKQ: {{".*ld.*"}} {{.*}} "-plugin" // LINKQ-NOT: "-plugin-opt=mcpu" -// RUN: %clang -### --target=avr -mmcu=atmega328 -flto=thin --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKR %s -// LINKR: {{".*ld.*"}} {{.*}} "--defsym=__DATA_REGION_ORIGIN__=0x800100" "-plugin" {{.*}} "-plugin-opt=mcpu=atmega328" "-plugin-opt=thinlto" +// RUN: %clang -### --target=avr -mmcu=atmega328 -fuse-ld=lld -flto=thin --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKR %s +// LINKR: {{".*ld.*"}} {{.*}} "--defsym=__DATA_REGION_ORIGIN__=0x800100" "-plugin-opt=mcpu=atmega328" "-plugin-opt=thinlto" -// RUN: %clang -### --target=avr -mmcu=atmega328 -flto --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKS %s -// LINKS: {{".*ld.*"}} {{.*}} "--defsym=__DATA_REGION_ORIGIN__=0x800100" "-plugin" {{.*}} "-plugin-opt=mcpu=atmega328" +// RUN: %clang -### --target=avr -mmcu=atmega328 -fuse-ld=lld -flto --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKS %s +// LINKS: {{".*ld.*"}} {{.*}} "--defsym=__DATA_REGION_ORIGIN__=0x800100" "-plugin-opt=mcpu=atmega328" // LINKS-NOT: "-plugin-opt=thinlto" From eac8e25ea5ee64ea46f93bba42d842fbde61609c Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 22 Mar 2023 16:56:05 +0000 Subject: [PATCH 028/208] [CodeGen] Fix type of MachineRegisterInfo::RegAllocHints. NFC. The first member of the pair should be unsigned instead of Register because it is the hint type, 0 for simple (target independent) hints and other values for target dependent hints. Differential Revision: https://reviews.llvm.org/D146646 --- llvm/include/llvm/CodeGen/MachineRegisterInfo.h | 17 ++++++++--------- llvm/lib/CodeGen/CalcSpillWeights.cpp | 2 +- llvm/lib/CodeGen/TargetRegisterInfo.cpp | 4 ++-- llvm/lib/CodeGen/VirtRegMap.cpp | 6 +++--- llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp | 4 ++-- .../deltas/ReduceVirtualRegisters.cpp | 2 +- 6 files changed, 17 insertions(+), 18 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h index ce447be3af41f..fc4e5ca756248 100644 --- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h @@ -101,8 +101,9 @@ class MachineRegisterInfo { /// first member of the pair being non-zero. If the hinted register is /// virtual, it means the allocator should prefer the physical register /// allocated to it if any. 
- IndexedMap>, - VirtReg2IndexFunctor> RegAllocHints; + IndexedMap>, + VirtReg2IndexFunctor> + RegAllocHints; /// PhysRegUseDefLists - This is an array of the head of the use/def list for /// physical registers. @@ -818,27 +819,25 @@ class MachineRegisterInfo { /// getRegAllocationHint - Return the register allocation hint for the /// specified virtual register. If there are many hints, this returns the /// one with the greatest weight. - std::pair - getRegAllocationHint(Register VReg) const { + std::pair getRegAllocationHint(Register VReg) const { assert(VReg.isVirtual()); Register BestHint = (RegAllocHints[VReg.id()].second.size() ? RegAllocHints[VReg.id()].second[0] : Register()); - return std::pair(RegAllocHints[VReg.id()].first, - BestHint); + return {RegAllocHints[VReg.id()].first, BestHint}; } /// getSimpleHint - same as getRegAllocationHint except it will only return /// a target independent hint. Register getSimpleHint(Register VReg) const { assert(VReg.isVirtual()); - std::pair Hint = getRegAllocationHint(VReg); + std::pair Hint = getRegAllocationHint(VReg); return Hint.first ? Register() : Hint.second; } /// getRegAllocationHints - Return a reference to the vector of all /// register allocation hints for VReg. - const std::pair> - &getRegAllocationHints(Register VReg) const { + const std::pair> & + getRegAllocationHints(Register VReg) const { assert(VReg.isVirtual()); return RegAllocHints[VReg]; } diff --git a/llvm/lib/CodeGen/CalcSpillWeights.cpp b/llvm/lib/CodeGen/CalcSpillWeights.cpp index 1146c1d465da5..5a005ba7b414d 100644 --- a/llvm/lib/CodeGen/CalcSpillWeights.cpp +++ b/llvm/lib/CodeGen/CalcSpillWeights.cpp @@ -157,7 +157,7 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start, unsigned NumInstr = 0; // Number of instructions using LI SmallPtrSet Visited; - std::pair TargetHint = MRI.getRegAllocationHint(LI.reg()); + std::pair TargetHint = MRI.getRegAllocationHint(LI.reg()); if (LI.isSpillable()) { Register Reg = LI.reg(); diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp index e6baf00c06451..051de1612284c 100644 --- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp +++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp @@ -424,8 +424,8 @@ bool TargetRegisterInfo::getRegAllocationHints( SmallVectorImpl &Hints, const MachineFunction &MF, const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const { const MachineRegisterInfo &MRI = MF.getRegInfo(); - const std::pair> &Hints_MRI = - MRI.getRegAllocationHints(VirtReg); + const std::pair> &Hints_MRI = + MRI.getRegAllocationHints(VirtReg); SmallSet HintedRegs; // First hint may be a target hint. 
diff --git a/llvm/lib/CodeGen/VirtRegMap.cpp b/llvm/lib/CodeGen/VirtRegMap.cpp index f80b06d7e9b7c..8e00712d2308e 100644 --- a/llvm/lib/CodeGen/VirtRegMap.cpp +++ b/llvm/lib/CodeGen/VirtRegMap.cpp @@ -116,10 +116,10 @@ bool VirtRegMap::hasPreferredPhys(Register VirtReg) const { } bool VirtRegMap::hasKnownPreference(Register VirtReg) const { - std::pair Hint = MRI->getRegAllocationHint(VirtReg); - if (Register::isPhysicalRegister(Hint.second)) + std::pair Hint = MRI->getRegAllocationHint(VirtReg); + if (Hint.second.isPhysical()) return true; - if (Register::isVirtualRegister(Hint.second)) + if (Hint.second.isVirtual()) return hasPhys(Hint.second); return false; } diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp index e6c6ab2efd50e..0fc2d8c6f5712 100644 --- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -338,7 +338,7 @@ bool ARMBaseRegisterInfo::getRegAllocationHints( SmallVectorImpl &Hints, const MachineFunction &MF, const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const { const MachineRegisterInfo &MRI = MF.getRegInfo(); - std::pair Hint = MRI.getRegAllocationHint(VirtReg); + std::pair Hint = MRI.getRegAllocationHint(VirtReg); unsigned Odd; switch (Hint.first) { @@ -391,7 +391,7 @@ bool ARMBaseRegisterInfo::getRegAllocationHints( void ARMBaseRegisterInfo::updateRegAllocHint(Register Reg, Register NewReg, MachineFunction &MF) const { MachineRegisterInfo *MRI = &MF.getRegInfo(); - std::pair Hint = MRI->getRegAllocationHint(Reg); + std::pair Hint = MRI->getRegAllocationHint(Reg); if ((Hint.first == ARMRI::RegPairOdd || Hint.first == ARMRI::RegPairEven) && Hint.second.isVirtual()) { // If 'Reg' is one of the even / odd register pair and it's now changed diff --git a/llvm/tools/llvm-reduce/deltas/ReduceVirtualRegisters.cpp b/llvm/tools/llvm-reduce/deltas/ReduceVirtualRegisters.cpp index eed5be7054e41..2b97e65bbf093 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceVirtualRegisters.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceVirtualRegisters.cpp @@ -23,7 +23,7 @@ static void dropRegisterHintsFromFunction(Oracle &O, MachineFunction &MF) { for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { Register Reg = Register::index2VirtReg(I); - const std::pair> &Hints = + const std::pair> &Hints = MRI.getRegAllocationHints(Reg); if (Hints.second.empty()) continue; From 7e5c48b8bd9ff0ee5de3ba28c833f1225f14e44d Mon Sep 17 00:00:00 2001 From: Carlos Galvez Date: Wed, 22 Mar 2023 18:38:10 +0000 Subject: [PATCH 029/208] [clang-tidy][NFC] Move avoid-underscore-in-googletest-name to google folder Since the check belongs to the google module, it makes sense that the corresponding test also belongs to the google module. 
Differential Revision: https://reviews.llvm.org/D146653 --- .../avoid-underscore-in-googletest-name.cpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename clang-tools-extra/test/clang-tidy/checkers/{readability => google}/avoid-underscore-in-googletest-name.cpp (100%) diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/avoid-underscore-in-googletest-name.cpp b/clang-tools-extra/test/clang-tidy/checkers/google/avoid-underscore-in-googletest-name.cpp similarity index 100% rename from clang-tools-extra/test/clang-tidy/checkers/readability/avoid-underscore-in-googletest-name.cpp rename to clang-tools-extra/test/clang-tidy/checkers/google/avoid-underscore-in-googletest-name.cpp From 6afcc54ac7d68fa2b28f0e7cbf9dc1d4ac7fb95e Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Wed, 22 Mar 2023 09:51:58 -0700 Subject: [PATCH 030/208] [SCEV] Infer no-self-wrap via constant ranges Without this, pointer IVs in loops with small constant trip counts couldn't be proven no-self-wrap. This came up in a new LSR transform, but may also benefit other SCEV consumers as well. Differential Revision: https://reviews.llvm.org/D146596 --- llvm/lib/Analysis/ScalarEvolution.cpp | 12 ++++++++++++ llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp | 9 +-------- .../Analysis/ScalarEvolution/different-loops-recs.ll | 2 +- .../max-backedge-taken-count-guard-info.ll | 9 ++++----- .../Transforms/LoopRotate/pr51981-scev-problem.ll | 4 ++-- llvm/test/Transforms/LoopVersioning/lcssa.ll | 1 - 6 files changed, 20 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index df525f4d6be7a..df872f61906c8 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -4988,6 +4988,18 @@ ScalarEvolution::proveNoWrapViaConstantRanges(const SCEVAddRecExpr *AR) { SCEV::NoWrapFlags Result = SCEV::FlagAnyWrap; + if (!AR->hasNoSelfWrap()) { + const SCEV *BECount = getConstantMaxBackedgeTakenCount(AR->getLoop()); + if (const SCEVConstant *BECountMax = dyn_cast(BECount)) { + ConstantRange StepCR = getSignedRange(AR->getStepRecurrence(*this)); + const APInt &BECountAP = BECountMax->getAPInt(); + unsigned NoOverflowBitWidth = + BECountAP.getActiveBits() + StepCR.getMinSignedBits(); + if (NoOverflowBitWidth <= getTypeSizeInBits(AR->getType())) + Result = ScalarEvolution::setFlags(Result, SCEV::FlagNW); + } + } + if (!AR->hasNoSignedWrap()) { ConstantRange AddRecRange = getSignedRange(AR); ConstantRange IncRange = getSignedRange(AR->getStepRecurrence(*this)); diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 82312de71c72c..fbdc436e0d37e 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -6769,14 +6769,7 @@ canFoldTermCondOfLoop(Loop *L, ScalarEvolution &SE, DominatorTree &DT, // iteration. The simplest case to consider is a candidate IV which is // narrower than the trip count (and thus original IV), but this can // also happen due to non-unit strides on the candidate IVs. - // TODO: This check should be replaceable with PostInc->hasNoSelfWrap(), - // but in practice we appear to be missing inference for cases we should - // be able to catch. 
- ConstantRange StepCR = SE.getSignedRange(AddRec->getStepRecurrence(SE)); - ConstantRange BECountCR = SE.getUnsignedRange(BECount); - unsigned NoOverflowBitWidth = BECountCR.getActiveBits() + StepCR.getMinSignedBits(); - unsigned ARBitWidth = SE.getTypeSizeInBits(AddRec->getType()); - if (NoOverflowBitWidth > ARBitWidth) + if (!AddRec->hasNoSelfWrap()) continue; const SCEVAddRecExpr *PostInc = AddRec->getPostIncExpr(SE); diff --git a/llvm/test/Analysis/ScalarEvolution/different-loops-recs.ll b/llvm/test/Analysis/ScalarEvolution/different-loops-recs.ll index 44081f32d0af6..60b2e9d50dd59 100644 --- a/llvm/test/Analysis/ScalarEvolution/different-loops-recs.ll +++ b/llvm/test/Analysis/ScalarEvolution/different-loops-recs.ll @@ -94,7 +94,7 @@ define void @test_01(i32 %a, i32 %b) { ; CHECK: %s3 = add i32 %is1, %phi5 ; CHECK-NEXT: --> {{{{}}(59 + (2 * %a) + %b),+,6}<%loop1>,+,2}<%loop2> ; CHECK: %s4 = add i32 %phi2, %is2 -; CHECK-NEXT: --> {{{{}}(159 + (2 * %b)),+,2}<%loop1>,+,6}<%loop2> +; CHECK-NEXT: --> {{{{}}(159 + (2 * %b)),+,2}<%loop1>,+,6}<%loop2> ; CHECK: %s5 = add i32 %is1, %is2 ; CHECK-NEXT: --> {{{{}}(165 + (2 * %a) + (2 * %b)),+,6}<%loop1>,+,6}<%loop2> ; CHECK: %s6 = add i32 %is2, %is1 diff --git a/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info.ll b/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info.ll index e1acec162d3c8..d4d3a9e13e277 100644 --- a/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info.ll +++ b/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info.ll @@ -1633,9 +1633,9 @@ define i32 @ptr_induction_ult_1(ptr %a, ptr %b) { ; CHECK-LABEL: 'ptr_induction_ult_1' ; CHECK-NEXT: Classifying expressions for: @ptr_induction_ult_1 ; CHECK-NEXT: %ptr.iv = phi ptr [ %ptr.iv.next, %loop ], [ %a, %entry ] -; CHECK-NEXT: --> {%a,+,4}<%loop> U: full-set S: full-set Exits: %a LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {%a,+,4}<%loop> U: full-set S: full-set Exits: %a LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %ptr.iv.next = getelementptr i32, ptr %ptr.iv, i64 1 -; CHECK-NEXT: --> {(4 + %a),+,4}<%loop> U: full-set S: full-set Exits: (4 + %a) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {(4 + %a),+,4}<%loop> U: full-set S: full-set Exits: (4 + %a) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: Determining loop execution counts for: @ptr_induction_ult_1 ; CHECK-NEXT: Loop %loop: backedge-taken count is 0 ; CHECK-NEXT: Loop %loop: constant max backedge-taken count is 0 @@ -1686,18 +1686,17 @@ exit: ret i32 0 } -; TODO: The pointer induction variable can be implied No Self Wrap. 
define void @gep_addrec_nw(ptr %a) { ; CHECK-LABEL: 'gep_addrec_nw' ; CHECK-NEXT: Classifying expressions for: @gep_addrec_nw ; CHECK-NEXT: %lsr.iv1 = phi ptr [ %uglygep2, %for.body ], [ %a, %entry ] -; CHECK-NEXT: --> {%a,+,4}<%for.body> U: full-set S: full-set Exits: (1512 + %a) LoopDispositions: { %for.body: Computable } +; CHECK-NEXT: --> {%a,+,4}<%for.body> U: full-set S: full-set Exits: (1512 + %a) LoopDispositions: { %for.body: Computable } ; CHECK-NEXT: %lsr.iv = phi i64 [ %lsr.iv.next, %for.body ], [ 379, %entry ] ; CHECK-NEXT: --> {379,+,-1}<%for.body> U: [1,380) S: [1,380) Exits: 1 LoopDispositions: { %for.body: Computable } ; CHECK-NEXT: %lsr.iv.next = add nsw i64 %lsr.iv, -1 ; CHECK-NEXT: --> {378,+,-1}<%for.body> U: [0,379) S: [0,379) Exits: 0 LoopDispositions: { %for.body: Computable } ; CHECK-NEXT: %uglygep2 = getelementptr i8, ptr %lsr.iv1, i64 4 -; CHECK-NEXT: --> {(4 + %a),+,4}<%for.body> U: full-set S: full-set Exits: (1516 + %a) LoopDispositions: { %for.body: Computable } +; CHECK-NEXT: --> {(4 + %a),+,4}<%for.body> U: full-set S: full-set Exits: (1516 + %a) LoopDispositions: { %for.body: Computable } ; CHECK-NEXT: Determining loop execution counts for: @gep_addrec_nw ; CHECK-NEXT: Loop %for.body: backedge-taken count is 378 ; CHECK-NEXT: Loop %for.body: constant max backedge-taken count is 378 diff --git a/llvm/test/Transforms/LoopRotate/pr51981-scev-problem.ll b/llvm/test/Transforms/LoopRotate/pr51981-scev-problem.ll index dd5031c56722f..2c2c88b6acb20 100644 --- a/llvm/test/Transforms/LoopRotate/pr51981-scev-problem.ll +++ b/llvm/test/Transforms/LoopRotate/pr51981-scev-problem.ll @@ -22,7 +22,7 @@ ; CHECK-SCEV: %narrow = trunc i32 %wide to i16 ; CHECK-SCEV: --> (trunc i32 %wide to i16) U: full-set S: full-set Exits: <> LoopDispositions: { %loop.outer.header: Variant, %loop.inner: Invariant } ; CHECK-SCEV: %iv = phi i16 [ %narrow, %loop.inner.ph ], [ %iv.plus, %loop.inner ] -; CHECK-SCEV: --> {(trunc i32 %wide to i16),+,1}<%loop.inner> U: full-set S: full-set Exits: (-1 + (700 umax (1 + (trunc i32 %wide to i16)))) LoopDispositions: { %loop.inner: Computable, %loop.outer.header: Variant } +; CHECK-SCEV: --> {(trunc i32 %wide to i16),+,1}<%loop.inner> U: full-set S: full-set Exits: (-1 + (700 umax (1 + (trunc i32 %wide to i16)))) LoopDispositions: { %loop.inner: Computable, %loop.outer.header: Variant } ; ; CHECK-SCEV: Classifying expressions for: @test_function ; CHECK-SCEV: %wide1 = load i32, ptr @offset, align 1 @@ -32,7 +32,7 @@ ; CHECK-SCEV: %narrow = trunc i32 %wide2 to i16 ; CHECK-SCEV: --> (trunc i32 %wide2 to i16) U: full-set S: full-set Exits: <> LoopDispositions: { %loop.inner.ph: Variant, %loop.inner: Invariant } ; CHECK-SCEV: %iv = phi i16 [ %narrow, %loop.inner.ph ], [ %iv.plus, %loop.inner ] -; CHECK-SCEV: --> {(trunc i32 %wide2 to i16),+,1}<%loop.inner> U: full-set S: full-set Exits: (-1 + (700 umax (1 + (trunc i32 %wide2 to i16)))) LoopDispositions: { %loop.inner: Computable, %loop.inner.ph: Variant } +; CHECK-SCEV: --> {(trunc i32 %wide2 to i16),+,1}<%loop.inner> U: full-set S: full-set Exits: (-1 + (700 umax (1 + (trunc i32 %wide2 to i16)))) LoopDispositions: { %loop.inner: Computable, %loop.inner.ph: Variant } @offset = external dso_local global i32, align 1 diff --git a/llvm/test/Transforms/LoopVersioning/lcssa.ll b/llvm/test/Transforms/LoopVersioning/lcssa.ll index ee14f693abd7a..4b51c21257243 100644 --- a/llvm/test/Transforms/LoopVersioning/lcssa.ll +++ b/llvm/test/Transforms/LoopVersioning/lcssa.ll @@ -56,7 +56,6 @@ define void 
@fill_no_null_opt(i8** %ls1.20, i8** %ls2.21, i8* %cse3.22) #0 { ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP]], [[SCEVGEP2]] ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[LS2_21_PROMOTED]], [[SCEVGEP1]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, i8* [[LS1_20_PROMOTED]], i64 -1 ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %bb1.ph.lver.orig, label %bb1.ph ; CHECK: bb1.ph.lver.orig: ; From e655d8a54880cf550567dda0e9a1a33f6ee98df5 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 9 Mar 2023 01:14:03 +0100 Subject: [PATCH 031/208] [libc++] Granularize __mutex_base This also updates the moved code to the current style. (i.e. `_VSTD` -> `std`, `_LIBCPP_INLINE_VISIBILITY` -> `_LIBCPP_HIDE_FROM_ABI`, clang-format). Reviewed By: Mordante, #libc, EricWF Spies: arichardson, libcxx-commits, mikhail.ramalho Differential Revision: https://reviews.llvm.org/D146228 --- libcxx/docs/ReleaseNotes.rst | 3 +- libcxx/include/CMakeLists.txt | 6 +- .../__condition_variable/condition_variable.h | 243 ++++++++++++++++++ libcxx/include/__mutex/lock_guard.h | 53 ++++ libcxx/include/__mutex/mutex.h | 53 ++++ libcxx/include/__mutex/tag_types.h | 48 ++++ libcxx/include/__mutex/unique_lock.h | 172 +++++++++++++ libcxx/include/condition_variable | 17 +- libcxx/include/libcxx.imp | 2 + libcxx/include/module.modulemap.in | 12 +- libcxx/include/mutex | 14 +- libcxx/include/shared_mutex | 13 +- libcxx/include/thread | 9 +- libcxx/src/shared_mutex.cpp | 1 + libcxx/test/libcxx/private_headers.verify.cpp | 6 +- .../native_handle.pass.cpp | 4 +- .../test/libcxx/transitive_includes/cxx03.csv | 8 + .../test/libcxx/transitive_includes/cxx11.csv | 8 + .../test/libcxx/transitive_includes/cxx14.csv | 1 + .../test/libcxx/transitive_includes/cxx17.csv | 1 + .../test/libcxx/transitive_includes/cxx20.csv | 1 + .../test/libcxx/transitive_includes/cxx2b.csv | 9 +- .../futures.shared_future/wait.pass.cpp | 1 + .../futures.unique_future/wait.pass.cpp | 1 + .../thread.lock.shared.cons/mutex.pass.cpp | 5 +- .../mutex_try_to_lock.pass.cpp | 6 +- .../thread.lock.shared.locking/lock.pass.cpp | 6 +- .../try_lock.pass.cpp | 3 +- .../try_lock_for.pass.cpp | 3 +- .../try_lock_until.pass.cpp | 3 +- .../thread.lock.unique.cons/mutex.pass.cpp | 5 +- .../mutex_try_to_lock.pass.cpp | 5 +- .../thread.lock.unique.locking/lock.pass.cpp | 5 +- .../try_lock.pass.cpp | 3 +- .../try_lock_for.pass.cpp | 3 +- .../try_lock_until.pass.cpp | 3 +- .../unlock.pass.cpp | 3 +- .../thread.mutex.class/lock.pass.cpp | 5 +- .../thread.mutex.class/try_lock.pass.cpp | 5 +- .../thread.mutex.recursive/lock.pass.cpp | 5 +- .../thread.mutex.recursive/try_lock.pass.cpp | 5 +- .../thread.shared_mutex.class/lock.pass.cpp | 5 +- .../lock_shared.pass.cpp | 5 +- .../try_lock.pass.cpp | 5 +- .../try_lock_shared.pass.cpp | 5 +- .../lock.pass.cpp | 6 +- .../lock_shared.pass.cpp | 3 +- .../try_lock.pass.cpp | 5 +- .../try_lock_shared.pass.cpp | 5 +- .../thread.timedmutex.class/lock.pass.cpp | 5 +- .../thread.timedmutex.class/try_lock.pass.cpp | 5 +- .../thread.timedmutex.recursive/lock.pass.cpp | 5 +- .../try_lock.pass.cpp | 5 +- .../thread.thread.this/sleep_until.pass.cpp | 5 +- .../time.duration.nonmember/ostream.pass.cpp | 1 + .../time.hms.nonmembers/ostream.pass.cpp | 3 +- .../time/time.syn/formatter.duration.pass.cpp | 1 + .../time/time.syn/formatter.hh_mm_ss.pass.cpp | 1 + libcxx/utils/data/ignore_format.txt | 1 - 59 files changed, 757 insertions(+), 73 deletions(-) create mode 100644 
libcxx/include/__condition_variable/condition_variable.h create mode 100644 libcxx/include/__mutex/lock_guard.h create mode 100644 libcxx/include/__mutex/mutex.h create mode 100644 libcxx/include/__mutex/tag_types.h create mode 100644 libcxx/include/__mutex/unique_lock.h diff --git a/libcxx/docs/ReleaseNotes.rst b/libcxx/docs/ReleaseNotes.rst index aa14e6bfcd97d..8083ba337fc16 100644 --- a/libcxx/docs/ReleaseNotes.rst +++ b/libcxx/docs/ReleaseNotes.rst @@ -62,7 +62,8 @@ Deprecations and Removals includes are removed based on the language version used. Incidental transitive inclusions of the following headers have been removed: - - C++2b: ``atomic``, ``bit``, ``cstring``, ``type_traits`` + - C++2b: ``atomic``, ``bit``, ``cstdint``, ``cstdlib``, ``cstring``, ``initializer_list``, ``new``, ``stdexcept``, + ``type_traits``, ``typeinfo`` - The headers ```` and ```` have been removed, since all the contents have been implemented in namespace ``std`` for at least two releases. diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index f8c52328ccff0..8232784cb6c7e 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -294,6 +294,7 @@ set(files __concepts/semiregular.h __concepts/swappable.h __concepts/totally_ordered.h + __condition_variable/condition_variable.h __config __coroutine/coroutine_handle.h __coroutine/coroutine_traits.h @@ -474,7 +475,10 @@ set(files __memory_resource/pool_options.h __memory_resource/synchronized_pool_resource.h __memory_resource/unsynchronized_pool_resource.h - __mutex_base + __mutex/lock_guard.h + __mutex/mutex.h + __mutex/tag_types.h + __mutex/unique_lock.h __node_handle __numeric/accumulate.h __numeric/adjacent_difference.h diff --git a/libcxx/include/__condition_variable/condition_variable.h b/libcxx/include/__condition_variable/condition_variable.h new file mode 100644 index 0000000000000..e66f78725a08c --- /dev/null +++ b/libcxx/include/__condition_variable/condition_variable.h @@ -0,0 +1,243 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___CONDITION_VARIABLE_CONDITION_VARIABLE_H +#define _LIBCPP___CONDITION_VARIABLE_CONDITION_VARIABLE_H + +#include <__chrono/steady_clock.h> +#include <__chrono/system_clock.h> +#include <__chrono/time_point.h> +#include <__config> +#include <__mutex/mutex.h> +#include <__mutex/unique_lock.h> +#include <__threading_support> +#include <__type_traits/enable_if.h> +#include <__type_traits/is_floating_point.h> +#include <__utility/move.h> +#include +#include + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + +#ifndef _LIBCPP_HAS_NO_THREADS + +_LIBCPP_BEGIN_NAMESPACE_STD + +// enum class cv_status +_LIBCPP_DECLARE_STRONG_ENUM(cv_status){no_timeout, timeout}; +_LIBCPP_DECLARE_STRONG_ENUM_EPILOG(cv_status) + +class _LIBCPP_TYPE_VIS condition_variable { + __libcpp_condvar_t __cv_ = _LIBCPP_CONDVAR_INITIALIZER; + +public: + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR condition_variable() _NOEXCEPT = default; + +# ifdef _LIBCPP_HAS_TRIVIAL_CONDVAR_DESTRUCTION + ~condition_variable() = default; +# else + ~condition_variable(); +# endif + + condition_variable(const condition_variable&) = delete; + condition_variable& operator=(const condition_variable&) = delete; + + void notify_one() _NOEXCEPT; + void notify_all() _NOEXCEPT; + + void wait(unique_lock& __lk) _NOEXCEPT; + template + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS void wait(unique_lock& __lk, _Predicate __pred); + + template + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS cv_status + wait_until(unique_lock& __lk, const chrono::time_point<_Clock, _Duration>& __t); + + template + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS bool + wait_until(unique_lock& __lk, const chrono::time_point<_Clock, _Duration>& __t, _Predicate __pred); + + template + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS cv_status + wait_for(unique_lock& __lk, const chrono::duration<_Rep, _Period>& __d); + + template + bool _LIBCPP_HIDE_FROM_ABI + wait_for(unique_lock& __lk, const chrono::duration<_Rep, _Period>& __d, _Predicate __pred); + + typedef __libcpp_condvar_t* native_handle_type; + _LIBCPP_HIDE_FROM_ABI native_handle_type native_handle() { return &__cv_; } + +private: + void + __do_timed_wait(unique_lock& __lk, chrono::time_point) _NOEXCEPT; +# if defined(_LIBCPP_HAS_COND_CLOCKWAIT) + void + __do_timed_wait(unique_lock& __lk, chrono::time_point) _NOEXCEPT; +# endif + template + void __do_timed_wait(unique_lock& __lk, chrono::time_point<_Clock, chrono::nanoseconds>) _NOEXCEPT; +}; +#endif // !_LIBCPP_HAS_NO_THREADS + +template +inline _LIBCPP_HIDE_FROM_ABI __enable_if_t::value, chrono::nanoseconds> +__safe_nanosecond_cast(chrono::duration<_Rep, _Period> __d) { + using namespace chrono; + using __ratio = ratio_divide<_Period, nano>; + using __ns_rep = nanoseconds::rep; + _Rep __result_float = __d.count() * __ratio::num / __ratio::den; + + _Rep __result_max = numeric_limits<__ns_rep>::max(); + if (__result_float >= __result_max) { + return nanoseconds::max(); + } + + _Rep __result_min = numeric_limits<__ns_rep>::min(); + if (__result_float <= __result_min) { + return nanoseconds::min(); + } + + return nanoseconds(static_cast<__ns_rep>(__result_float)); +} + +template +inline _LIBCPP_HIDE_FROM_ABI __enable_if_t::value, chrono::nanoseconds> +__safe_nanosecond_cast(chrono::duration<_Rep, _Period> __d) { + 
using namespace chrono; + if (__d.count() == 0) { + return nanoseconds(0); + } + + using __ratio = ratio_divide<_Period, nano>; + using __ns_rep = nanoseconds::rep; + __ns_rep __result_max = numeric_limits<__ns_rep>::max(); + if (__d.count() > 0 && __d.count() > __result_max / __ratio::num) { + return nanoseconds::max(); + } + + __ns_rep __result_min = numeric_limits<__ns_rep>::min(); + if (__d.count() < 0 && __d.count() < __result_min / __ratio::num) { + return nanoseconds::min(); + } + + __ns_rep __result = __d.count() * __ratio::num / __ratio::den; + if (__result == 0) { + return nanoseconds(1); + } + + return nanoseconds(__result); +} + +#ifndef _LIBCPP_HAS_NO_THREADS +template +void condition_variable::wait(unique_lock& __lk, _Predicate __pred) { + while (!__pred()) + wait(__lk); +} + +template +cv_status condition_variable::wait_until(unique_lock& __lk, const chrono::time_point<_Clock, _Duration>& __t) { + using namespace chrono; + using __clock_tp_ns = time_point<_Clock, nanoseconds>; + + typename _Clock::time_point __now = _Clock::now(); + if (__t <= __now) + return cv_status::timeout; + + __clock_tp_ns __t_ns = __clock_tp_ns(std::__safe_nanosecond_cast(__t.time_since_epoch())); + + __do_timed_wait(__lk, __t_ns); + return _Clock::now() < __t ? cv_status::no_timeout : cv_status::timeout; +} + +template +bool condition_variable::wait_until( + unique_lock& __lk, const chrono::time_point<_Clock, _Duration>& __t, _Predicate __pred) { + while (!__pred()) { + if (wait_until(__lk, __t) == cv_status::timeout) + return __pred(); + } + return true; +} + +template +cv_status condition_variable::wait_for(unique_lock& __lk, const chrono::duration<_Rep, _Period>& __d) { + using namespace chrono; + if (__d <= __d.zero()) + return cv_status::timeout; + using __ns_rep = nanoseconds::rep; + steady_clock::time_point __c_now = steady_clock::now(); + +# if defined(_LIBCPP_HAS_COND_CLOCKWAIT) + using __clock_tp_ns = time_point; + __ns_rep __now_count_ns = std::__safe_nanosecond_cast(__c_now.time_since_epoch()).count(); +# else + using __clock_tp_ns = time_point; + __ns_rep __now_count_ns = std::__safe_nanosecond_cast(system_clock::now().time_since_epoch()).count(); +# endif + + __ns_rep __d_ns_count = std::__safe_nanosecond_cast(__d).count(); + + if (__now_count_ns > numeric_limits<__ns_rep>::max() - __d_ns_count) { + __do_timed_wait(__lk, __clock_tp_ns::max()); + } else { + __do_timed_wait(__lk, __clock_tp_ns(nanoseconds(__now_count_ns + __d_ns_count))); + } + + return steady_clock::now() - __c_now < __d ? 
cv_status::no_timeout : cv_status::timeout; +} + +template +inline bool +condition_variable::wait_for(unique_lock& __lk, const chrono::duration<_Rep, _Period>& __d, _Predicate __pred) { + return wait_until(__lk, chrono::steady_clock::now() + __d, std::move(__pred)); +} + +# if defined(_LIBCPP_HAS_COND_CLOCKWAIT) +inline void condition_variable::__do_timed_wait( + unique_lock& __lk, chrono::time_point __tp) _NOEXCEPT { + using namespace chrono; + if (!__lk.owns_lock()) + __throw_system_error(EPERM, "condition_variable::timed wait: mutex not locked"); + nanoseconds __d = __tp.time_since_epoch(); + timespec __ts; + seconds __s = duration_cast(__d); + using __ts_sec = decltype(__ts.tv_sec); + const __ts_sec __ts_sec_max = numeric_limits<__ts_sec>::max(); + if (__s.count() < __ts_sec_max) { + __ts.tv_sec = static_cast<__ts_sec>(__s.count()); + __ts.tv_nsec = (__d - __s).count(); + } else { + __ts.tv_sec = __ts_sec_max; + __ts.tv_nsec = giga::num - 1; + } + int __ec = pthread_cond_clockwait(&__cv_, __lk.mutex()->native_handle(), CLOCK_MONOTONIC, &__ts); + if (__ec != 0 && __ec != ETIMEDOUT) + __throw_system_error(__ec, "condition_variable timed_wait failed"); +} +# endif // _LIBCPP_HAS_COND_CLOCKWAIT + +template +inline void condition_variable::__do_timed_wait(unique_lock& __lk, + chrono::time_point<_Clock, chrono::nanoseconds> __tp) _NOEXCEPT { + wait_for(__lk, __tp - _Clock::now()); +} + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP_HAS_NO_THREADS + +_LIBCPP_POP_MACROS + +#endif // _LIBCPP___CONDITION_VARIABLE_CONDITION_VARIABLE_H diff --git a/libcxx/include/__mutex/lock_guard.h b/libcxx/include/__mutex/lock_guard.h new file mode 100644 index 0000000000000..c075512fb97a9 --- /dev/null +++ b/libcxx/include/__mutex/lock_guard.h @@ -0,0 +1,53 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___MUTEX_LOCK_GUARD_H +#define _LIBCPP___MUTEX_LOCK_GUARD_H + +#include <__config> +#include <__mutex/tag_types.h> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +#ifndef _LIBCPP_HAS_NO_THREADS + +_LIBCPP_BEGIN_NAMESPACE_STD + +template +class _LIBCPP_TEMPLATE_VIS _LIBCPP_THREAD_SAFETY_ANNOTATION(scoped_lockable) lock_guard { +public: + typedef _Mutex mutex_type; + +private: + mutex_type& __m_; + +public: + _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI explicit lock_guard(mutex_type& __m) + _LIBCPP_THREAD_SAFETY_ANNOTATION(acquire_capability(__m)) + : __m_(__m) { + __m_.lock(); + } + + _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI lock_guard(mutex_type& __m, adopt_lock_t) + _LIBCPP_THREAD_SAFETY_ANNOTATION(requires_capability(__m)) + : __m_(__m) {} + _LIBCPP_HIDE_FROM_ABI ~lock_guard() _LIBCPP_THREAD_SAFETY_ANNOTATION(release_capability()) { __m_.unlock(); } + +private: + lock_guard(lock_guard const&) = delete; + lock_guard& operator=(lock_guard const&) = delete; +}; +_LIBCPP_CTAD_SUPPORTED_FOR_TYPE(lock_guard); + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP_HAS_NO_THREADS + +#endif // _LIBCPP___MUTEX_LOCK_GUARD_H diff --git a/libcxx/include/__mutex/mutex.h b/libcxx/include/__mutex/mutex.h new file mode 100644 index 0000000000000..13def1e5d1535 --- /dev/null +++ b/libcxx/include/__mutex/mutex.h @@ -0,0 +1,53 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___MUTEX_MUTEX_H +#define _LIBCPP___MUTEX_MUTEX_H + +#include <__config> +#include <__threading_support> +#include <__type_traits/is_nothrow_default_constructible.h> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +#ifndef _LIBCPP_HAS_NO_THREADS + +_LIBCPP_BEGIN_NAMESPACE_STD + +class _LIBCPP_TYPE_VIS _LIBCPP_THREAD_SAFETY_ANNOTATION(capability("mutex")) mutex { + __libcpp_mutex_t __m_ = _LIBCPP_MUTEX_INITIALIZER; + +public: + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR mutex() = default; + + mutex(const mutex&) = delete; + mutex& operator=(const mutex&) = delete; + +# if defined(_LIBCPP_HAS_TRIVIAL_MUTEX_DESTRUCTION) + ~mutex() = default; +# else + ~mutex() _NOEXCEPT; +# endif + + void lock() _LIBCPP_THREAD_SAFETY_ANNOTATION(acquire_capability()); + bool try_lock() _NOEXCEPT _LIBCPP_THREAD_SAFETY_ANNOTATION(try_acquire_capability(true)); + void unlock() _NOEXCEPT _LIBCPP_THREAD_SAFETY_ANNOTATION(release_capability()); + + typedef __libcpp_mutex_t* native_handle_type; + _LIBCPP_HIDE_FROM_ABI native_handle_type native_handle() { return &__m_; } +}; + +static_assert(is_nothrow_default_constructible::value, "the default constructor for std::mutex must be nothrow"); + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP_HAS_NO_THREADS + +#endif // _LIBCPP___MUTEX_MUTEX_H diff --git a/libcxx/include/__mutex/tag_types.h b/libcxx/include/__mutex/tag_types.h new file mode 100644 index 0000000000000..02cf007ae1219 --- /dev/null +++ b/libcxx/include/__mutex/tag_types.h @@ -0,0 +1,48 @@ +//===----------------------------------------------------------------------===// +// 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___MUTEX_TAG_TYPES_H +#define _LIBCPP___MUTEX_TAG_TYPES_H + +#include <__config> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +#ifndef _LIBCPP_HAS_NO_THREADS + +_LIBCPP_BEGIN_NAMESPACE_STD + +struct _LIBCPP_TYPE_VIS defer_lock_t { + explicit defer_lock_t() = default; +}; + +struct _LIBCPP_TYPE_VIS try_to_lock_t { + explicit try_to_lock_t() = default; +}; + +struct _LIBCPP_TYPE_VIS adopt_lock_t { + explicit adopt_lock_t() = default; +}; + +# if defined(_LIBCPP_BUILDING_LIBRARY) +extern _LIBCPP_EXPORTED_FROM_ABI const defer_lock_t defer_lock; +extern _LIBCPP_EXPORTED_FROM_ABI const try_to_lock_t try_to_lock; +extern _LIBCPP_EXPORTED_FROM_ABI const adopt_lock_t adopt_lock; +# elif !defined(_LIBCPP_CXX03_LANG) +/* inline */ constexpr defer_lock_t defer_lock = defer_lock_t(); +/* inline */ constexpr try_to_lock_t try_to_lock = try_to_lock_t(); +/* inline */ constexpr adopt_lock_t adopt_lock = adopt_lock_t(); +# endif + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP_HAS_NO_THREADS + +#endif // _LIBCPP___MUTEX_TAG_TYPES_H diff --git a/libcxx/include/__mutex/unique_lock.h b/libcxx/include/__mutex/unique_lock.h new file mode 100644 index 0000000000000..a057d1c69d3ef --- /dev/null +++ b/libcxx/include/__mutex/unique_lock.h @@ -0,0 +1,172 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___MUTEX_UNIQUE_LOCK_H +#define _LIBCPP___MUTEX_UNIQUE_LOCK_H + +#include <__chrono/duration.h> +#include <__chrono/time_point.h> +#include <__config> +#include <__memory/addressof.h> +#include <__mutex/tag_types.h> +#include <__utility/swap.h> +#include + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +#ifndef _LIBCPP_HAS_NO_THREADS + +_LIBCPP_BEGIN_NAMESPACE_STD + +template +class _LIBCPP_TEMPLATE_VIS unique_lock { +public: + typedef _Mutex mutex_type; + +private: + mutex_type* __m_; + bool __owns_; + +public: + _LIBCPP_HIDE_FROM_ABI unique_lock() _NOEXCEPT : __m_(nullptr), __owns_(false) {} + _LIBCPP_HIDE_FROM_ABI explicit unique_lock(mutex_type& __m) : __m_(std::addressof(__m)), __owns_(true) { + __m_->lock(); + } + + _LIBCPP_HIDE_FROM_ABI unique_lock(mutex_type& __m, defer_lock_t) _NOEXCEPT + : __m_(std::addressof(__m)), + __owns_(false) {} + + _LIBCPP_HIDE_FROM_ABI unique_lock(mutex_type& __m, try_to_lock_t) + : __m_(std::addressof(__m)), __owns_(__m.try_lock()) {} + + _LIBCPP_HIDE_FROM_ABI unique_lock(mutex_type& __m, adopt_lock_t) : __m_(std::addressof(__m)), __owns_(true) {} + + template + _LIBCPP_HIDE_FROM_ABI unique_lock(mutex_type& __m, const chrono::time_point<_Clock, _Duration>& __t) + : __m_(std::addressof(__m)), __owns_(__m.try_lock_until(__t)) {} + + template + _LIBCPP_HIDE_FROM_ABI unique_lock(mutex_type& __m, const chrono::duration<_Rep, _Period>& __d) + : __m_(std::addressof(__m)), __owns_(__m.try_lock_for(__d)) {} + + _LIBCPP_HIDE_FROM_ABI ~unique_lock() { + if (__owns_) + __m_->unlock(); + } + + unique_lock(unique_lock const&) = delete; + unique_lock& operator=(unique_lock const&) = delete; + + _LIBCPP_HIDE_FROM_ABI unique_lock(unique_lock&& __u) _NOEXCEPT : __m_(__u.__m_), __owns_(__u.__owns_) { + __u.__m_ = nullptr; + __u.__owns_ = false; + } + + _LIBCPP_HIDE_FROM_ABI unique_lock& operator=(unique_lock&& __u) _NOEXCEPT { + if (__owns_) + __m_->unlock(); + + __m_ = __u.__m_; + __owns_ = __u.__owns_; + __u.__m_ = nullptr; + __u.__owns_ = false; + return *this; + } + + void lock(); + bool try_lock(); + + template + bool try_lock_for(const chrono::duration<_Rep, _Period>& __d); + + template + bool try_lock_until(const chrono::time_point<_Clock, _Duration>& __t); + + void unlock(); + + _LIBCPP_HIDE_FROM_ABI void swap(unique_lock& __u) _NOEXCEPT { + std::swap(__m_, __u.__m_); + std::swap(__owns_, __u.__owns_); + } + + _LIBCPP_HIDE_FROM_ABI mutex_type* release() _NOEXCEPT { + mutex_type* __m = __m_; + __m_ = nullptr; + __owns_ = false; + return __m; + } + + _LIBCPP_HIDE_FROM_ABI bool owns_lock() const _NOEXCEPT { return __owns_; } + _LIBCPP_HIDE_FROM_ABI explicit operator bool() const _NOEXCEPT { return __owns_; } + _LIBCPP_HIDE_FROM_ABI mutex_type* mutex() const _NOEXCEPT { return __m_; } +}; +_LIBCPP_CTAD_SUPPORTED_FOR_TYPE(unique_lock); + +template +void unique_lock<_Mutex>::lock() { + if (__m_ == nullptr) + __throw_system_error(EPERM, "unique_lock::lock: references null mutex"); + if (__owns_) + __throw_system_error(EDEADLK, "unique_lock::lock: already locked"); + __m_->lock(); + __owns_ = true; +} + +template +bool unique_lock<_Mutex>::try_lock() { + if (__m_ == nullptr) + __throw_system_error(EPERM, "unique_lock::try_lock: references null mutex"); + if (__owns_) + __throw_system_error(EDEADLK, "unique_lock::try_lock: already locked"); + __owns_ = __m_->try_lock(); 
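+  // __owns_ now records whether the mutex was actually acquired; callers can
+  // observe the result through the return value below, owns_lock(), or the
+  // explicit operator bool().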
+ return __owns_; +} + +template +template +bool unique_lock<_Mutex>::try_lock_for(const chrono::duration<_Rep, _Period>& __d) { + if (__m_ == nullptr) + __throw_system_error(EPERM, "unique_lock::try_lock_for: references null mutex"); + if (__owns_) + __throw_system_error(EDEADLK, "unique_lock::try_lock_for: already locked"); + __owns_ = __m_->try_lock_for(__d); + return __owns_; +} + +template +template +bool unique_lock<_Mutex>::try_lock_until(const chrono::time_point<_Clock, _Duration>& __t) { + if (__m_ == nullptr) + __throw_system_error(EPERM, "unique_lock::try_lock_until: references null mutex"); + if (__owns_) + __throw_system_error(EDEADLK, "unique_lock::try_lock_until: already locked"); + __owns_ = __m_->try_lock_until(__t); + return __owns_; +} + +template +void unique_lock<_Mutex>::unlock() { + if (!__owns_) + __throw_system_error(EPERM, "unique_lock::unlock: not locked"); + __m_->unlock(); + __owns_ = false; +} + +template +inline _LIBCPP_HIDE_FROM_ABI void swap(unique_lock<_Mutex>& __x, unique_lock<_Mutex>& __y) _NOEXCEPT { + __x.swap(__y); +} + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP_HAS_NO_THREADS + +#endif // _LIBCPP___MUTEX_UNIQUE_LOCK_H diff --git a/libcxx/include/condition_variable b/libcxx/include/condition_variable index bb7b13c6fe3eb..e1eec6066ec2e 100644 --- a/libcxx/include/condition_variable +++ b/libcxx/include/condition_variable @@ -107,10 +107,18 @@ public: */ #include <__assert> // all public C++ headers provide the assertion handler +#include <__chrono/duration.h> +#include <__chrono/steady_clock.h> +#include <__chrono/time_point.h> +#include <__condition_variable/condition_variable.h> #include <__config> #include <__memory/shared_ptr.h> #include <__memory/unique_ptr.h> -#include <__mutex_base> +#include <__mutex/lock_guard.h> +#include <__mutex/mutex.h> +#include <__mutex/tag_types.h> +#include <__mutex/unique_lock.h> +#include <__utility/move.h> #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -271,7 +279,14 @@ _LIBCPP_END_NAMESPACE_STD #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include +# include +# include +# include +# include +# include +# include # include +# include #endif #endif // _LIBCPP_CONDITION_VARIABLE diff --git a/libcxx/include/libcxx.imp b/libcxx/include/libcxx.imp index d85ae36953a1e..b9a670c7be50b 100644 --- a/libcxx/include/libcxx.imp +++ b/libcxx/include/libcxx.imp @@ -22,6 +22,7 @@ { include: [ "@<__chrono/.*>", "private", "", "public" ] }, { include: [ "@<__compare/.*>", "private", "", "public" ] }, { include: [ "@<__concepts/.*>", "private", "", "public" ] }, + { include: [ "@<__condition_variable/.*>", "private", "", "public" ] }, { include: [ "@<__coroutine/.*>", "private", "", "public" ] }, { include: [ "@<__debug_utils/.*>", "private", "", "public" ] }, { include: [ "@<__exception/.*>", "private", "", "public" ] }, @@ -34,6 +35,7 @@ { include: [ "@<__iterator/.*>", "private", "", "public" ] }, { include: [ "@<__memory/.*>", "private", "", "public" ] }, { include: [ "@<__memory_resource/.*>", "private", "", "public" ] }, + { include: [ "@<__mutex/.*>", "private", "", "public" ] }, { include: [ "@<__numeric/.*>", "private", "", "public" ] }, { include: [ "@<__random/.*>", "private", "", "public" ] }, { include: [ "@<__ranges/.*>", "private", "", "public" ] }, diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in index 89a03cbb341d3..7bae70fafee70 100644 --- a/libcxx/include/module.modulemap.in +++ b/libcxx/include/module.modulemap.in @@ 
-813,6 +813,10 @@ module std [system] { module condition_variable { header "condition_variable" export * + + module __condition_variable { + module condition_variable { private header "__condition_variable/condition_variable.h" } + } } module coroutine { header "coroutine" @@ -1174,6 +1178,13 @@ module std [system] { @requires_LIBCXX_ENABLE_THREADS@ header "mutex" export * + + module __mutex { + module lock_guard { private header "__mutex/lock_guard.h" } + module mutex { private header "__mutex/mutex.h" } + module tag_types { private header "__mutex/tag_types.h" } + module unique_lock { private header "__mutex/unique_lock.h" } + } } module new { header "new" @@ -1695,7 +1706,6 @@ module std [system] { private header "__locale" export * } module __mbstate_t { private header "__mbstate_t.h" export * } - module __mutex_base { private header "__mutex_base" export * } module __node_handle { private header "__node_handle" export * } module __split_buffer { private header "__split_buffer" export * } module __std_stream { diff --git a/libcxx/include/mutex b/libcxx/include/mutex index 9d24768d6c0e2..3e89aaa4e6e6e 100644 --- a/libcxx/include/mutex +++ b/libcxx/include/mutex @@ -187,9 +187,15 @@ template */ #include <__assert> // all public C++ headers provide the assertion handler +#include <__chrono/steady_clock.h> +#include <__chrono/time_point.h> +#include <__condition_variable/condition_variable.h> #include <__config> #include <__memory/shared_ptr.h> -#include <__mutex_base> +#include <__mutex/lock_guard.h> +#include <__mutex/mutex.h> +#include <__mutex/tag_types.h> +#include <__mutex/unique_lock.h> #include <__threading_support> #include <__utility/forward.h> #include @@ -706,9 +712,15 @@ _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include +# include +# include # include # include +# include +# include +# include # include +# include #endif #endif // _LIBCPP_MUTEX diff --git a/libcxx/include/shared_mutex b/libcxx/include/shared_mutex index 550f8344ae19a..dd142f4d53600 100644 --- a/libcxx/include/shared_mutex +++ b/libcxx/include/shared_mutex @@ -124,7 +124,18 @@ template #include <__assert> // all public C++ headers provide the assertion handler #include <__availability> +#include <__chrono/duration.h> +#include <__chrono/steady_clock.h> +#include <__chrono/time_point.h> +#include <__condition_variable/condition_variable.h> #include <__config> +#include <__memory/addressof.h> +#include <__mutex/mutex.h> +#include <__mutex/tag_types.h> +#include <__mutex/unique_lock.h> +#include <__utility/swap.h> +#include +#include #include _LIBCPP_PUSH_MACROS @@ -133,8 +144,6 @@ _LIBCPP_PUSH_MACROS #if _LIBCPP_STD_VER >= 14 || defined(_LIBCPP_BUILDING_LIBRARY) -#include <__mutex_base> - #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif diff --git a/libcxx/include/thread b/libcxx/include/thread index 13e722ca9476a..19c8c2df89fd4 100644 --- a/libcxx/include/thread +++ b/libcxx/include/thread @@ -84,12 +84,16 @@ void sleep_for(const chrono::duration& rel_time); */ #include <__assert> // all public C++ headers provide the assertion handler +#include <__chrono/steady_clock.h> +#include <__chrono/time_point.h> +#include <__condition_variable/condition_variable.h> #include <__config> #include <__exception/terminate.h> #include <__functional/hash.h> #include <__memory/addressof.h> #include <__memory/unique_ptr.h> -#include <__mutex_base> +#include <__mutex/mutex.h> +#include <__mutex/unique_lock.h> #include 
<__thread/poll_with_backoff.h> #include <__thread/timed_backoff_policy.h> #include <__threading_support> @@ -416,7 +420,10 @@ _LIBCPP_POP_MACROS #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 +# include +# include # include +# include #endif #endif // _LIBCPP_THREAD diff --git a/libcxx/src/shared_mutex.cpp b/libcxx/src/shared_mutex.cpp index 73d4dc1c1c7a9..b1976c11d7ef2 100644 --- a/libcxx/src/shared_mutex.cpp +++ b/libcxx/src/shared_mutex.cpp @@ -10,6 +10,7 @@ #ifndef _LIBCPP_HAS_NO_THREADS +#include #include #if defined(__ELF__) && defined(_LIBCPP_LINK_PTHREAD_LIB) # pragma comment(lib, "pthread") diff --git a/libcxx/test/libcxx/private_headers.verify.cpp b/libcxx/test/libcxx/private_headers.verify.cpp index dd2a655642e51..263c05a8b2ce5 100644 --- a/libcxx/test/libcxx/private_headers.verify.cpp +++ b/libcxx/test/libcxx/private_headers.verify.cpp @@ -328,6 +328,7 @@ END-SCRIPT #include <__concepts/semiregular.h> // expected-error@*:* {{use of private header from outside its module: '__concepts/semiregular.h'}} #include <__concepts/swappable.h> // expected-error@*:* {{use of private header from outside its module: '__concepts/swappable.h'}} #include <__concepts/totally_ordered.h> // expected-error@*:* {{use of private header from outside its module: '__concepts/totally_ordered.h'}} +#include <__condition_variable/condition_variable.h> // expected-error@*:* {{use of private header from outside its module: '__condition_variable/condition_variable.h'}} #include <__coroutine/coroutine_handle.h> // expected-error@*:* {{use of private header from outside its module: '__coroutine/coroutine_handle.h'}} #include <__coroutine/coroutine_traits.h> // expected-error@*:* {{use of private header from outside its module: '__coroutine/coroutine_traits.h'}} #include <__coroutine/noop_coroutine_handle.h> // expected-error@*:* {{use of private header from outside its module: '__coroutine/noop_coroutine_handle.h'}} @@ -505,7 +506,10 @@ END-SCRIPT #include <__memory_resource/pool_options.h> // expected-error@*:* {{use of private header from outside its module: '__memory_resource/pool_options.h'}} #include <__memory_resource/synchronized_pool_resource.h> // expected-error@*:* {{use of private header from outside its module: '__memory_resource/synchronized_pool_resource.h'}} #include <__memory_resource/unsynchronized_pool_resource.h> // expected-error@*:* {{use of private header from outside its module: '__memory_resource/unsynchronized_pool_resource.h'}} -#include <__mutex_base> // expected-error@*:* {{use of private header from outside its module: '__mutex_base'}} +#include <__mutex/lock_guard.h> // expected-error@*:* {{use of private header from outside its module: '__mutex/lock_guard.h'}} +#include <__mutex/mutex.h> // expected-error@*:* {{use of private header from outside its module: '__mutex/mutex.h'}} +#include <__mutex/tag_types.h> // expected-error@*:* {{use of private header from outside its module: '__mutex/tag_types.h'}} +#include <__mutex/unique_lock.h> // expected-error@*:* {{use of private header from outside its module: '__mutex/unique_lock.h'}} #include <__node_handle> // expected-error@*:* {{use of private header from outside its module: '__node_handle'}} #include <__numeric/accumulate.h> // expected-error@*:* {{use of private header from outside its module: '__numeric/accumulate.h'}} #include <__numeric/adjacent_difference.h> // expected-error@*:* {{use of private header from outside its module: '__numeric/adjacent_difference.h'}} diff --git 
a/libcxx/test/libcxx/thread/thread.condition/thread.condition.condvar/native_handle.pass.cpp b/libcxx/test/libcxx/thread/thread.condition/thread.condition.condvar/native_handle.pass.cpp index 374aa2fd15350..13d1bfcb88126 100644 --- a/libcxx/test/libcxx/thread/thread.condition/thread.condition.condvar/native_handle.pass.cpp +++ b/libcxx/test/libcxx/thread/thread.condition/thread.condition.condvar/native_handle.pass.cpp @@ -17,8 +17,10 @@ // typedef pthread_cond_t* native_handle_type; // native_handle_type native_handle(); -#include #include +#include +#include +#include #include "test_macros.h" diff --git a/libcxx/test/libcxx/transitive_includes/cxx03.csv b/libcxx/test/libcxx/transitive_includes/cxx03.csv index 92a84f59268bb..e52cf25b099ae 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx03.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx03.csv @@ -734,6 +734,14 @@ set stdexcept set tuple set type_traits set version +shared_mutex cerrno +shared_mutex cstddef +shared_mutex ctime +shared_mutex iosfwd +shared_mutex limits +shared_mutex ratio +shared_mutex system_error +shared_mutex type_traits shared_mutex version source_location cstdint source_location version diff --git a/libcxx/test/libcxx/transitive_includes/cxx11.csv b/libcxx/test/libcxx/transitive_includes/cxx11.csv index 833e8ac19f8e9..3f322fefe710f 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx11.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx11.csv @@ -735,6 +735,14 @@ set stdexcept set tuple set type_traits set version +shared_mutex cerrno +shared_mutex cstddef +shared_mutex ctime +shared_mutex iosfwd +shared_mutex limits +shared_mutex ratio +shared_mutex system_error +shared_mutex type_traits shared_mutex version source_location cstdint source_location version diff --git a/libcxx/test/libcxx/transitive_includes/cxx14.csv b/libcxx/test/libcxx/transitive_includes/cxx14.csv index c3c0ae6f71e8d..5503cb3ed1bf9 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx14.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx14.csv @@ -737,6 +737,7 @@ set stdexcept set tuple set type_traits set version +shared_mutex cerrno shared_mutex cstddef shared_mutex ctime shared_mutex iosfwd diff --git a/libcxx/test/libcxx/transitive_includes/cxx17.csv b/libcxx/test/libcxx/transitive_includes/cxx17.csv index c3c0ae6f71e8d..5503cb3ed1bf9 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx17.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx17.csv @@ -737,6 +737,7 @@ set stdexcept set tuple set type_traits set version +shared_mutex cerrno shared_mutex cstddef shared_mutex ctime shared_mutex iosfwd diff --git a/libcxx/test/libcxx/transitive_includes/cxx20.csv b/libcxx/test/libcxx/transitive_includes/cxx20.csv index a5be0f14d8589..6985245378c82 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx20.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx20.csv @@ -743,6 +743,7 @@ set stdexcept set tuple set type_traits set version +shared_mutex cerrno shared_mutex cstddef shared_mutex ctime shared_mutex iosfwd diff --git a/libcxx/test/libcxx/transitive_includes/cxx2b.csv b/libcxx/test/libcxx/transitive_includes/cxx2b.csv index 8bed29fafc013..9701567c65835 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx2b.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx2b.csv @@ -110,11 +110,8 @@ complex version concepts cstddef concepts version condition_variable cstddef -condition_variable cstdint condition_variable cstdlib -condition_variable cstring condition_variable ctime -condition_variable initializer_list condition_variable 
iosfwd condition_variable limits condition_variable new @@ -403,9 +400,7 @@ memory_resource version mutex cstddef mutex cstdint mutex cstdlib -mutex cstring mutex ctime -mutex initializer_list mutex iosfwd mutex limits mutex new @@ -519,6 +514,7 @@ set optional set stdexcept set tuple set version +shared_mutex cerrno shared_mutex cstddef shared_mutex ctime shared_mutex iosfwd @@ -591,12 +587,9 @@ system_error string system_error version thread compare thread cstddef -thread cstdint -thread cstring thread ctime thread iosfwd thread limits -thread new thread ratio thread system_error thread tuple diff --git a/libcxx/test/std/thread/futures/futures.shared_future/wait.pass.cpp b/libcxx/test/std/thread/futures/futures.shared_future/wait.pass.cpp index 12c71ab05e26e..5709e9d45df97 100644 --- a/libcxx/test/std/thread/futures/futures.shared_future/wait.pass.cpp +++ b/libcxx/test/std/thread/futures/futures.shared_future/wait.pass.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include "make_test_thread.h" #include "test_macros.h" diff --git a/libcxx/test/std/thread/futures/futures.unique_future/wait.pass.cpp b/libcxx/test/std/thread/futures/futures.unique_future/wait.pass.cpp index 2385156c3154b..4e6b789e45c26 100644 --- a/libcxx/test/std/thread/futures/futures.unique_future/wait.pass.cpp +++ b/libcxx/test/std/thread/futures/futures.unique_future/wait.pass.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include "make_test_thread.h" #include "test_macros.h" diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.cons/mutex.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.cons/mutex.pass.cpp index c5536d8f37e9c..962dcc03729f4 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.cons/mutex.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.cons/mutex.pass.cpp @@ -22,11 +22,12 @@ // template shared_lock(shared_lock<_Mutex>) // -> shared_lock<_Mutex>; // C++17 +#include +#include +#include #include #include #include -#include -#include #include "make_test_thread.h" #include "test_macros.h" diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.cons/mutex_try_to_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.cons/mutex_try_to_lock.pass.cpp index fde0ed6b0b5b7..4527b6d8124bc 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.cons/mutex_try_to_lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.cons/mutex_try_to_lock.pass.cpp @@ -19,11 +19,13 @@ // shared_lock(mutex_type& m, try_to_lock_t); +#include +#include +#include +#include #include #include #include -#include -#include #include "make_test_thread.h" #include "test_macros.h" diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/lock.pass.cpp index 1bc131a7b968f..f7715168ee10d 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/lock.pass.cpp @@ -19,11 +19,13 @@ // void lock(); +#include +#include +#include +#include 
#include #include #include -#include -#include #include "make_test_thread.h" #include "test_macros.h" diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/try_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/try_lock.pass.cpp index ae387cdc8446d..0e707fcf2d50a 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/try_lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/try_lock.pass.cpp @@ -17,9 +17,10 @@ // bool try_lock(); -#include #include #include +#include +#include #include "test_macros.h" diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/try_lock_for.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/try_lock_for.pass.cpp index 7f8189c298577..d28ae395ccb0d 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/try_lock_for.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/try_lock_for.pass.cpp @@ -16,10 +16,11 @@ // template // bool try_lock_for(const chrono::duration& rel_time); -#include #include #include #include +#include +#include #include "test_macros.h" diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/try_lock_until.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/try_lock_until.pass.cpp index fb4afdd4d9101..880bf1cbd4999 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/try_lock_until.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/try_lock_until.pass.cpp @@ -16,10 +16,11 @@ // template // bool try_lock_until(const chrono::time_point& abs_time); -#include #include #include #include +#include +#include #include "test_macros.h" diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex.pass.cpp index bba78cf24178a..2be25748e903b 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex.pass.cpp @@ -18,10 +18,11 @@ // template unique_lock(unique_lock<_Mutex>) // -> unique_lock<_Mutex>; // C++17 +#include +#include +#include #include #include -#include -#include #include "make_test_thread.h" #include "test_macros.h" diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_try_to_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_try_to_lock.pass.cpp index f4f344ef9b2c3..992d383dfa780 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_try_to_lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_try_to_lock.pass.cpp @@ -16,10 +16,11 @@ // unique_lock(mutex_type& m, try_to_lock_t); +#include +#include +#include #include #include -#include -#include #include 
"make_test_thread.h" #include "test_macros.h" diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/lock.pass.cpp index fa43f5dd874a5..4aa6660449c99 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/lock.pass.cpp @@ -16,10 +16,11 @@ // void lock(); +#include +#include +#include #include #include -#include -#include #include "make_test_thread.h" #include "test_macros.h" diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock.pass.cpp index 9249959ad005a..4cf5ec2ab5ccf 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock.pass.cpp @@ -16,8 +16,9 @@ // bool try_lock(); -#include #include +#include +#include #include "test_macros.h" diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_for.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_for.pass.cpp index fe29d1625069e..8e7004e5eec85 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_for.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_for.pass.cpp @@ -16,8 +16,9 @@ // template // bool try_lock_for(const chrono::duration& rel_time); -#include #include +#include +#include #include "test_macros.h" diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_until.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_until.pass.cpp index a1e8553b965fe..077bc517399ab 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_until.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_until.pass.cpp @@ -16,8 +16,9 @@ // template // bool try_lock_until(const chrono::time_point& abs_time); -#include #include +#include +#include #include "test_macros.h" diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/unlock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/unlock.pass.cpp index 4cd72b6bd8d2f..30c795150dace 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/unlock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/unlock.pass.cpp @@ -14,8 +14,9 @@ // void unlock(); -#include #include +#include +#include #include "test_macros.h" diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/lock.pass.cpp 
b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/lock.pass.cpp index 6f91cc3d3ab11..b3e76cf886c4d 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/lock.pass.cpp @@ -15,10 +15,11 @@ // void lock(); +#include +#include +#include #include #include -#include -#include #include "make_test_thread.h" #include "test_macros.h" diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/try_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/try_lock.pass.cpp index 645b74bca920e..bf3cb6530b3b9 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/try_lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/try_lock.pass.cpp @@ -15,10 +15,11 @@ // bool try_lock(); +#include +#include +#include #include #include -#include -#include #include "make_test_thread.h" #include "test_macros.h" diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/lock.pass.cpp index 51b64b9aaddbb..d9bff9b3cbda5 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/lock.pass.cpp @@ -15,10 +15,11 @@ // void lock(); +#include +#include +#include #include #include -#include -#include #include "make_test_thread.h" #include "test_macros.h" diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/try_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/try_lock.pass.cpp index 801e2e738c5cb..1247c1ce1ba5f 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/try_lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/try_lock.pass.cpp @@ -15,10 +15,11 @@ // bool try_lock(); +#include +#include +#include #include #include -#include -#include #include "make_test_thread.h" #include "test_macros.h" diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/lock.pass.cpp index 24a52ae69f5af..5d20951576a82 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/lock.pass.cpp @@ -20,10 +20,11 @@ // void lock(); +#include +#include +#include 
#include #include -#include -#include #include "make_test_thread.h" #include "test_macros.h" diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/lock_shared.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/lock_shared.pass.cpp index 3f5a0642ab128..eca75f005ca55 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/lock_shared.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/lock_shared.pass.cpp @@ -20,11 +20,12 @@ // void lock_shared(); +#include +#include +#include #include #include #include -#include -#include #include "make_test_thread.h" #include "test_macros.h" diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/try_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/try_lock.pass.cpp index 92727eadbd9b3..bcbe7dfd78c1f 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/try_lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/try_lock.pass.cpp @@ -20,10 +20,11 @@ // bool try_lock(); +#include +#include +#include #include #include -#include -#include #include "make_test_thread.h" #include "test_macros.h" diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/try_lock_shared.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/try_lock_shared.pass.cpp index ac6a95bd87a52..5a54a7ecdd1f8 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/try_lock_shared.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/try_lock_shared.pass.cpp @@ -20,11 +20,12 @@ // bool try_lock_shared(); +#include +#include +#include #include #include #include -#include -#include #include "make_test_thread.h" #include "test_macros.h" diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/lock.pass.cpp index c4836a574e9dc..ffec5056f103f 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/lock.pass.cpp @@ -20,12 +20,12 @@ // void lock(); -#include - #include -#include #include +#include +#include #include +#include #include "make_test_thread.h" #include "test_macros.h" diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/lock_shared.pass.cpp 
b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/lock_shared.pass.cpp index 1ec0814e207ed..7187c95f2cd2d 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/lock_shared.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/lock_shared.pass.cpp @@ -20,12 +20,13 @@ // void lock_shared(); -#include #include #include +#include #include #include +#include #include #include "make_test_thread.h" diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock.pass.cpp index 20eda45677f4e..e454ae2214bc1 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock.pass.cpp @@ -20,10 +20,11 @@ // bool try_lock(); +#include +#include +#include #include #include -#include -#include #include "make_test_thread.h" #include "test_macros.h" diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_shared.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_shared.pass.cpp index d1f37a9c42df6..9597218f36ecb 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_shared.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_shared.pass.cpp @@ -20,11 +20,12 @@ // bool try_lock_shared(); +#include +#include +#include #include #include #include -#include -#include #include "make_test_thread.h" #include "test_macros.h" diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/lock.pass.cpp index a2a91bc26a7dc..a71bd3d38b2c3 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/lock.pass.cpp @@ -15,10 +15,11 @@ // void lock(); +#include +#include +#include #include #include -#include -#include #include "make_test_thread.h" #include "test_macros.h" diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock.pass.cpp index 02d0874c08069..f3942ccb9d860 100644 --- 
a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock.pass.cpp @@ -15,10 +15,11 @@ // bool try_lock(); +#include +#include +#include #include #include -#include -#include #include "make_test_thread.h" #include "test_macros.h" diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/lock.pass.cpp index 91c6f1c064324..bad5a4457e516 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/lock.pass.cpp @@ -15,10 +15,11 @@ // void lock(); +#include +#include +#include #include #include -#include -#include #include "make_test_thread.h" #include "test_macros.h" diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/try_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/try_lock.pass.cpp index 5915698553f5a..63be0ac713f8b 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/try_lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/try_lock.pass.cpp @@ -15,10 +15,11 @@ // bool try_lock(); +#include +#include +#include #include #include -#include -#include #include "make_test_thread.h" #include "test_macros.h" diff --git a/libcxx/test/std/thread/thread.threads/thread.thread.this/sleep_until.pass.cpp b/libcxx/test/std/thread/thread.threads/thread.thread.this/sleep_until.pass.cpp index 3b4ae203d2c3b..7a080651da393 100644 --- a/libcxx/test/std/thread/thread.threads/thread.thread.this/sleep_until.pass.cpp +++ b/libcxx/test/std/thread/thread.threads/thread.thread.this/sleep_until.pass.cpp @@ -13,9 +13,10 @@ // template // void sleep_until(const chrono::time_point& abs_time); -#include -#include #include +#include +#include +#include #include "test_macros.h" diff --git a/libcxx/test/std/time/time.duration/time.duration.nonmember/ostream.pass.cpp b/libcxx/test/std/time/time.duration/time.duration.nonmember/ostream.pass.cpp index 0c2f6852ecc27..e05146686434a 100644 --- a/libcxx/test/std/time/time.duration/time.duration.nonmember/ostream.pass.cpp +++ b/libcxx/test/std/time/time.duration/time.duration.nonmember/ostream.pass.cpp @@ -31,6 +31,7 @@ #include #include +#include #include #include "make_string.h" diff --git a/libcxx/test/std/time/time.hms/time.hms.nonmembers/ostream.pass.cpp b/libcxx/test/std/time/time.hms/time.hms.nonmembers/ostream.pass.cpp index 5bace05949e87..c639ea8dcec42 100644 --- a/libcxx/test/std/time/time.hms/time.hms.nonmembers/ostream.pass.cpp +++ b/libcxx/test/std/time/time.hms/time.hms.nonmembers/ostream.pass.cpp @@ -27,8 +27,9 @@ // basic_ostream& // operator<<(basic_ostream& os, const hh_mm_ss& hms); -#include #include +#include +#include #include #include "make_string.h" diff --git 
a/libcxx/test/std/time/time.syn/formatter.duration.pass.cpp b/libcxx/test/std/time/time.syn/formatter.duration.pass.cpp index 1a2fccacb44d7..3ca7ae2b40c91 100644 --- a/libcxx/test/std/time/time.syn/formatter.duration.pass.cpp +++ b/libcxx/test/std/time/time.syn/formatter.duration.pass.cpp @@ -31,6 +31,7 @@ #include #include #include +#include #include #include "formatter_tests.h" diff --git a/libcxx/test/std/time/time.syn/formatter.hh_mm_ss.pass.cpp b/libcxx/test/std/time/time.syn/formatter.hh_mm_ss.pass.cpp index 1bbe72a8e285e..2fbca0e1aece3 100644 --- a/libcxx/test/std/time/time.syn/formatter.hh_mm_ss.pass.cpp +++ b/libcxx/test/std/time/time.syn/formatter.hh_mm_ss.pass.cpp @@ -32,6 +32,7 @@ #include #include #include +#include #include #include "formatter_tests.h" diff --git a/libcxx/utils/data/ignore_format.txt b/libcxx/utils/data/ignore_format.txt index a6838ccb2b022..a89361cbebc6a 100644 --- a/libcxx/utils/data/ignore_format.txt +++ b/libcxx/utils/data/ignore_format.txt @@ -489,7 +489,6 @@ libcxx/include/__memory/uninitialized_algorithms.h libcxx/include/__memory/unique_ptr.h libcxx/include/__memory/uses_allocator.h libcxx/include/mutex -libcxx/include/__mutex_base libcxx/include/new libcxx/include/__node_handle libcxx/include/numbers From 0766c1bd5c0ece916be14b620b02bea845cb9fac Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Wed, 22 Mar 2023 12:07:08 -0700 Subject: [PATCH 032/208] [LFTR] Simplify integer case for genLoopLimit [nfc-ish] The integer case in genLoopLimit reduces down to a special case for narrowing the bitwidth of the limit, and then performing the same expansion we would for a pointer IV. Differential Revision: https://reviews.llvm.org/D146638 --- llvm/lib/Transforms/Scalar/IndVarSimplify.cpp | 69 +++++-------------- 1 file changed, 18 insertions(+), 51 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 43f3beb4b34b5..28e6794389538 100644 --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -910,60 +910,27 @@ static Value *genLoopLimit(PHINode *IndVar, BasicBlock *ExitingBB, assert(isLoopCounter(IndVar, L, SE)); assert(ExitCount->getType()->isIntegerTy() && "exit count must be integer"); const SCEVAddRecExpr *AR = cast(SE->getSCEV(IndVar)); - const SCEV *IVInit = AR->getStart(); assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride"); - // IVInit may be a pointer while ExitCount is an integer when FindLoopCounter - // finds a valid pointer IV. - if (IndVar->getType()->isPointerTy()) { - const SCEVAddRecExpr *ARBase = UsePostInc ? AR->getPostIncExpr(*SE) : AR; - const SCEV *IVLimit = ARBase->evaluateAtIteration(ExitCount, *SE); - assert(SE->isLoopInvariant(IVLimit, L) && - "Computed iteration count is not loop invariant!"); - return Rewriter.expandCodeFor(IVLimit, IndVar->getType(), - ExitingBB->getTerminator()); - } else { - // In any other case, convert both IVInit and ExitCount to integers before - // comparing. This may result in SCEV expansion of pointers, but in practice - // SCEV will fold the pointer arithmetic away as such: - // BECount = (IVEnd - IVInit - 1) => IVLimit = IVInit (postinc). - // - // Valid Cases: (1) both integers is most common; (2) both may be pointers - // for simple memset-style loops. - // - // IVInit integer and ExitCount pointer would only occur if a canonical IV - // were generated on top of case #2, which is not expected. 
- - // For unit stride, IVCount = Start + ExitCount with 2's complement - // overflow. - - // For integer IVs, truncate the IV before computing IVInit + BECount, - // unless we know apriori that the limit must be a constant when evaluated - // in the bitwidth of the IV. We prefer (potentially) keeping a truncate - // of the IV in the loop over a (potentially) expensive expansion of the - // widened exit count add(zext(add)) expression. - if (SE->getTypeSizeInBits(IVInit->getType()) - > SE->getTypeSizeInBits(ExitCount->getType())) { - if (isa(IVInit) && isa(ExitCount)) - ExitCount = SE->getZeroExtendExpr(ExitCount, IVInit->getType()); - else - IVInit = SE->getTruncateExpr(IVInit, ExitCount->getType()); - } - - const SCEV *IVLimit = SE->getAddExpr(IVInit, ExitCount); - - if (UsePostInc) - IVLimit = SE->getAddExpr(IVLimit, SE->getOne(IVLimit->getType())); - - // Expand the code for the iteration count. - assert(SE->isLoopInvariant(IVLimit, L) && - "Computed iteration count is not loop invariant!"); - // Ensure that we generate the same type as IndVar, or a smaller integer - // type. In the presence of null pointer values, we have an integer type - // SCEV expression (IVInit) for a pointer type IV value (IndVar). - return Rewriter.expandCodeFor(IVLimit, ExitCount->getType(), - ExitingBB->getTerminator()); + // For integer IVs, truncate the IV before computing the limit unless we + // know apriori that the limit must be a constant when evaluated in the + // bitwidth of the IV. We prefer (potentially) keeping a truncate of the + // IV in the loop over a (potentially) expensive expansion of the widened + // exit count add(zext(add)) expression. + if (IndVar->getType()->isIntegerTy() && + SE->getTypeSizeInBits(AR->getType()) > + SE->getTypeSizeInBits(ExitCount->getType())) { + const SCEV *IVInit = AR->getStart(); + if (!isa(IVInit) || !isa(ExitCount)) + AR = cast(SE->getTruncateExpr(AR, ExitCount->getType())); } + + const SCEVAddRecExpr *ARBase = UsePostInc ? 
AR->getPostIncExpr(*SE) : AR; + const SCEV *IVLimit = ARBase->evaluateAtIteration(ExitCount, *SE); + assert(SE->isLoopInvariant(IVLimit, L) && + "Computed iteration count is not loop invariant!"); + return Rewriter.expandCodeFor(IVLimit, ARBase->getType(), + ExitingBB->getTerminator()); } /// This method rewrites the exit condition of the loop to be a canonical != From 08622314d2a23101536ca744f0092eaec63e14e8 Mon Sep 17 00:00:00 2001 From: Jeff Byrnes Date: Wed, 22 Mar 2023 12:19:00 -0700 Subject: [PATCH 033/208] Precommit tests for D146327 --- .../propagate-remove-dead-args.ll | 66 +++++++++++++++++ .../dce-after-argument-promotion-loads.ll | 72 +++++++++++++++++++ 2 files changed, 138 insertions(+) create mode 100644 llvm/test/Transforms/ArgumentPromotion/propagate-remove-dead-args.ll create mode 100644 llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion-loads.ll diff --git a/llvm/test/Transforms/ArgumentPromotion/propagate-remove-dead-args.ll b/llvm/test/Transforms/ArgumentPromotion/propagate-remove-dead-args.ll new file mode 100644 index 0000000000000..cc1f7fb26a479 --- /dev/null +++ b/llvm/test/Transforms/ArgumentPromotion/propagate-remove-dead-args.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt < %s -passes=argpromotion -S | FileCheck %s + +%ptr.struct = type { ptr, ptr, ptr } + +define internal void @child(ptr %this, ptr %y, ptr %x) { +; CHECK-LABEL: define internal void @child +; CHECK-SAME: (ptr [[Y:%.*]], half [[X_0_VAL:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: store half [[X_0_VAL]], ptr [[Y]], align 2 +; CHECK-NEXT: ret void +; +entry: + %0 = load half, ptr %x + store half %0, ptr %y + ret void +} + +define internal void @parent(ptr %this, ptr %p1, ptr %p2) { +; CHECK-LABEL: define internal void @parent +; CHECK-SAME: (ptr [[THIS:%.*]], ptr [[P1:%.*]], ptr [[P2:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SRC_ELEMENT_OP_0:%.*]] = getelementptr ptr, ptr [[THIS]], i64 0 +; CHECK-NEXT: [[LOAD0:%.*]] = load ptr, ptr [[SRC_ELEMENT_OP_0]], align 8 +; CHECK-NEXT: [[P2_VAL2:%.*]] = load half, ptr [[P2]], align 2 +; CHECK-NEXT: call void @child(ptr [[P1]], half [[P2_VAL2]]) +; CHECK-NEXT: [[SRC_ELEMENT_OP_1:%.*]] = getelementptr ptr, ptr [[THIS]], i64 1 +; CHECK-NEXT: [[LOAD1:%.*]] = load ptr, ptr [[SRC_ELEMENT_OP_1]], align 8 +; CHECK-NEXT: [[P2_VAL1:%.*]] = load half, ptr [[P2]], align 2 +; CHECK-NEXT: call void @child(ptr [[P1]], half [[P2_VAL1]]) +; CHECK-NEXT: [[SRC_ELEMENT_OP_2:%.*]] = getelementptr ptr, ptr [[THIS]], i64 2 +; CHECK-NEXT: [[LOAD2:%.*]] = load ptr, ptr [[SRC_ELEMENT_OP_2]], align 8 +; CHECK-NEXT: [[P2_VAL:%.*]] = load half, ptr [[P2]], align 2 +; CHECK-NEXT: call void @child(ptr [[P1]], half [[P2_VAL]]) +; CHECK-NEXT: ret void +; +entry: + %src_element_op_0 = getelementptr ptr, ptr %this, i64 0 + %load0 = load ptr, ptr %src_element_op_0 + call void @child(ptr %load0, ptr %p1, ptr %p2) + %src_element_op_1 = getelementptr ptr, ptr %this, i64 1 + %load1 = load ptr, ptr %src_element_op_1 + call void @child(ptr %load1, ptr %p1, ptr %p2) + %src_element_op_2 = getelementptr ptr, ptr %this, i64 2 + %load2 = load ptr, ptr %src_element_op_2 + call void @child(ptr %load2, ptr %p1, ptr %p2) + ret void +} + +define void @grandparent() { +; CHECK-LABEL: define void @grandparent() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[F:%.*]] = alloca [[PTR_STRUCT:%.*]], align 8 +; CHECK-NEXT: [[XPTR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[YPTR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: 
call void @parent(ptr [[F]], ptr [[XPTR]], ptr [[YPTR]]) +; CHECK-NEXT: ret void +; +entry: + %f = alloca %ptr.struct + %xptr = alloca i32 + %yptr = alloca i32 + call void @parent(ptr %f, ptr %xptr, ptr %yptr) + ret void +} + diff --git a/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion-loads.ll b/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion-loads.ll new file mode 100644 index 0000000000000..2bdd42b3dd8ca --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion-loads.ll @@ -0,0 +1,72 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature +; RUN: opt -O3 -S < %s | FileCheck %s + +; Arg promotion eliminates the struct argument, and eliminates dead arguments, but introduces and leaves dead loads of the eliminated dead arg in callers + +%struct.ss = type { ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr } + +define internal void @phantomLoad(ptr %p, ptr %y, ptr %x) { +entry: + %0 = load i32, ptr %x + store i32 %0, ptr %y + ret void +} + +define ptr @parent(ptr align 8 dereferenceable(72) %f, i16 %val1, i16 %val2, i32 %val3) align 2 { +; CHECK-LABEL: define {{[^@]+}}@parent +; CHECK-SAME: (ptr readonly returned align 8 dereferenceable(72) [[F:%.*]], i16 [[VAL1:%.*]], i16 [[VAL2:%.*]], i32 [[VAL3:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] align 2 { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP_NOT_NOT_I:%.*]] = icmp eq i32 [[VAL3]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_SS:%.*]], ptr [[F]], i64 0, i32 8 +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 +; CHECK-NEXT: br i1 [[CMP_NOT_NOT_I]], label [[IF_THEN_I:%.*]], label [[IF_ELSE_I:%.*]] +; CHECK: if.then.i: +; CHECK-NEXT: store i16 [[VAL1]], ptr [[TMP1]], align 2 +; CHECK-NEXT: [[ADD_PTR_I_I_I_I_I:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 +; CHECK-NEXT: br label [[BADCHILD_EXIT:%.*]] +; CHECK: if.else.i: +; CHECK-NEXT: [[ADD_PTR_I_I_I_I7_I:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 +; CHECK-NEXT: store i16 [[VAL1]], ptr [[ADD_PTR_I_I_I_I7_I]], align 2 +; CHECK-NEXT: br label [[BADCHILD_EXIT]] +; CHECK: badChild.exit: +; CHECK-NEXT: [[DOTSINK_I:%.*]] = phi ptr [ [[TMP1]], [[IF_ELSE_I]] ], [ [[ADD_PTR_I_I_I_I_I]], [[IF_THEN_I]] ] +; CHECK-NEXT: store i16 [[VAL2]], ptr [[DOTSINK_I]], align 2 +; CHECK-NEXT: ret ptr [[F]] +; +entry: + call void @badChild(ptr align 8 dereferenceable(72) %f, i16 %val1, i16 %val2, i32 %val3) #4 + ret ptr %f +} + +define internal void @badChild(ptr align 8 dereferenceable(72) %this, i16 %val1, i16 %val2, i32 %val3) align 2 { +entry: + %othergep = getelementptr inbounds %struct.ss, ptr %this, i64 0, i32 2 + %load0 = load ptr, ptr %othergep, align 8 + %load2 = load ptr, ptr %this + %x = alloca i32 + %y = alloca i32 + call void @phantomLoad(ptr %load0, ptr %x, ptr %y) + call void @phantomLoad(ptr %load2, ptr %x, ptr %y) + %cmp.not.not = icmp eq i32 %val3, 0 + br i1 %cmp.not.not, label %if.then, label %if.else + +if.then: ; preds = %entry + %0 = getelementptr inbounds %struct.ss, ptr %this, i64 0, i32 8 + %1 = load ptr, ptr %0, align 8 + store i16 %val1, ptr %1, align 2 + %add.ptr.i.i.i.i = getelementptr inbounds i8, ptr %1, i64 16 + store i16 %val2, ptr %add.ptr.i.i.i.i, align 2 + br label %if.end + +if.else: ; preds = %entry + %2 = getelementptr inbounds %struct.ss, ptr %this, i64 0, i32 8 + %3 = load ptr, ptr %2, align 8 + %add.ptr.i.i.i.i7 = getelementptr inbounds i8, ptr %3, i64 16 + store i16 %val1, ptr %add.ptr.i.i.i.i7, align 2 + store i16 
%val2, ptr %3, align 2
+  br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+  ret void
+}
+

From 587b3713309b03f73d2affba8a9a992a70aff174 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot
Date: Wed, 22 Mar 2023 19:30:55 +0000
Subject: [PATCH 034/208] [gn build] Port e655d8a54880

---
 llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index 764f12ce21ec2..bab71dfafae19 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -369,6 +369,7 @@ if (current_toolchain == default_toolchain) {
     "__concepts/semiregular.h",
     "__concepts/swappable.h",
     "__concepts/totally_ordered.h",
+    "__condition_variable/condition_variable.h",
     "__config",
     "__coroutine/coroutine_handle.h",
     "__coroutine/coroutine_traits.h",
@@ -549,6 +550,10 @@ if (current_toolchain == default_toolchain) {
     "__memory_resource/pool_options.h",
     "__memory_resource/synchronized_pool_resource.h",
     "__memory_resource/unsynchronized_pool_resource.h",
+    "__mutex/lock_guard.h",
+    "__mutex/mutex.h",
+    "__mutex/tag_types.h",
+    "__mutex/unique_lock.h",
     "__mutex_base",
     "__node_handle",
     "__numeric/accumulate.h",

From cf8dc9dfe9eb66621cb7c860bf81b29699415c66 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?=
Date: Wed, 22 Mar 2023 20:21:59 +0100
Subject: [PATCH 035/208] [JITLink] Introduce target flags for Symbol and
 prepare ObjectLinkingLayer to account for them

AArch32 branch offsets explicitly encode the target instruction subset
(Arm/Thumb) in their least significant bit. We want this bit set (or clear)
in addresses we hand out, but the addresses in the LinkGraph should be the
real/physical addresses.

This patch allows ELFLinkGraphBuilders to set target-specific flags in
jitlink::Symbol and prepares ObjectLinkingLayer to account for them.

Reviewed By: lhames

Differential Revision: https://reviews.llvm.org/D146641
---
 .../llvm/ExecutionEngine/JITLink/JITLink.h | 20 +++++++++++++---
 .../JITLink/ELFLinkGraphBuilder.h | 23 ++++++++++++++++---
 .../Orc/ObjectLinkingLayer.cpp | 13 +++++++----
 3 files changed, 45 insertions(+), 11 deletions(-)

diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h
index 2b6696c7fdffd..3bc9bebea6e0b 100644
--- a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h
+++ b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h
@@ -367,13 +367,15 @@ inline orc::ExecutorAddr alignToBlock(orc::ExecutorAddr Addr, Block &B) {
 // must end with a zero, and contain no zeros before the end.
 bool isCStringBlock(Block &B);
 
-/// Describes symbol linkage. This can be used to make resolve definition
-/// clashes.
+/// Describes symbol linkage. This can be used to resolve definition clashes.
 enum class Linkage : uint8_t {
   Strong,
   Weak,
 };
 
+/// Holds target-specific properties for a symbol.
+using TargetFlagsType = uint8_t;
+
 /// For errors and debugging output.
 const char *getLinkageName(Linkage L);
 
@@ -611,6 +613,17 @@ class Symbol {
     this->S = static_cast<uint8_t>(S);
   }
 
+  /// Check whether the given target flags are set for this Symbol.
+  bool hasTargetFlags(TargetFlagsType Flags) const {
+    return static_cast<TargetFlagsType>(TargetFlags) & Flags;
+  }
+
+  /// Set the target flags for this Symbol.
+ void setTargetFlags(TargetFlagsType Flags) { + assert(Flags <= 1 && "Add more bits to store more than single flag"); + TargetFlags = Flags; + } + /// Returns true if this is a weakly referenced external symbol. /// This method may only be called on external symbols. bool isWeaklyReferenced() const { @@ -655,12 +668,13 @@ class Symbol { // FIXME: A char* or SymbolStringPtr may pack better. StringRef Name; Addressable *Base = nullptr; - uint64_t Offset : 58; + uint64_t Offset : 57; uint64_t L : 1; uint64_t S : 2; uint64_t IsLive : 1; uint64_t IsCallable : 1; uint64_t WeakRef : 1; + uint64_t TargetFlags : 1; size_t Size = 0; }; diff --git a/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h b/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h index 26feb8ea3277b..9d2d4958dcf6c 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h +++ b/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h @@ -112,6 +112,17 @@ class ELFLinkGraphBuilder : public ELFLinkGraphBuilderBase { Expected> getSymbolLinkageAndScope(const typename ELFT::Sym &Sym, StringRef Name); + /// Set the target flags on the given Symbol. + virtual TargetFlagsType makeTargetFlags(const typename ELFT::Sym &Sym) { + return TargetFlagsType{}; + } + + /// Get the physical offset of the symbol on the target platform. + virtual orc::ExecutorAddrDiff getRawOffset(const typename ELFT::Sym &Sym, + TargetFlagsType Flags) { + return Sym.getValue(); + } + Error prepare(); Error graphifySections(); Error graphifySymbols(); @@ -478,6 +489,9 @@ template Error ELFLinkGraphBuilder::graphifySymbols() { << "\"\n"; }); + TargetFlagsType Flags = makeTargetFlags(Sym); + orc::ExecutorAddrDiff Offset = getRawOffset(Sym, Flags); + // In RISCV, temporary symbols (Used to generate dwarf, eh_frame // sections...) will appear in object code's symbol table, and LLVM does // not use names on these temporary symbols (RISCV gnu toolchain uses @@ -485,10 +499,13 @@ template Error ELFLinkGraphBuilder::graphifySymbols() { // anonymous symbol. auto &GSym = Name->empty() - ? G->addAnonymousSymbol(*B, Sym.getValue(), Sym.st_size, + ? 
G->addAnonymousSymbol(*B, Offset, Sym.st_size, false, false) - : G->addDefinedSymbol(*B, Sym.getValue(), *Name, Sym.st_size, L, - S, Sym.getType() == ELF::STT_FUNC, false); + : G->addDefinedSymbol(*B, Offset, *Name, Sym.st_size, L, + S, Sym.getType() == ELF::STT_FUNC, + false); + + GSym.setTargetFlags(Flags); setGraphSymbol(SymIndex, GSym); } } else if (Sym.isUndefined() && Sym.isExternal()) { diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp index e7356e5d3151e..2c270cd66285d 100644 --- a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp @@ -39,6 +39,10 @@ bool hasInitializerSection(jitlink::LinkGraph &G) { return false; } +JITTargetAddress getJITSymbolPtrForSymbol(Symbol &Sym) { + return Sym.getAddress().getValue(); +} + JITSymbolFlags getJITSymbolFlagsForSymbol(Symbol &Sym) { JITSymbolFlags Flags; @@ -215,10 +219,9 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { for (auto *Sym : G.defined_symbols()) if (Sym->hasName() && Sym->getScope() != Scope::Local) { auto InternedName = ES.intern(Sym->getName()); + auto Ptr = getJITSymbolPtrForSymbol(*Sym); auto Flags = getJITSymbolFlagsForSymbol(*Sym); - - InternedResult[InternedName] = - JITEvaluatedSymbol(Sym->getAddress().getValue(), Flags); + InternedResult[InternedName] = JITEvaluatedSymbol(Ptr, Flags); if (AutoClaim && !MR->getSymbols().count(InternedName)) { assert(!ExtraSymbolsToClaim.count(InternedName) && "Duplicate symbol to claim?"); @@ -229,9 +232,9 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { for (auto *Sym : G.absolute_symbols()) if (Sym->hasName() && Sym->getScope() != Scope::Local) { auto InternedName = ES.intern(Sym->getName()); + auto Ptr = getJITSymbolPtrForSymbol(*Sym); auto Flags = getJITSymbolFlagsForSymbol(*Sym); - InternedResult[InternedName] = - JITEvaluatedSymbol(Sym->getAddress().getValue(), Flags); + InternedResult[InternedName] = JITEvaluatedSymbol(Ptr, Flags); if (AutoClaim && !MR->getSymbols().count(InternedName)) { assert(!ExtraSymbolsToClaim.count(InternedName) && "Duplicate symbol to claim?"); From 77ed8311a625f449e7ee8bebda3b2940be6dc211 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 22 Mar 2023 12:56:13 -0700 Subject: [PATCH 036/208] [test] Add tools/llvm-dwarfdump/ARM/lit.local.cfg after D143513 --- llvm/test/tools/llvm-dwarfdump/ARM/lit.local.cfg | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 llvm/test/tools/llvm-dwarfdump/ARM/lit.local.cfg diff --git a/llvm/test/tools/llvm-dwarfdump/ARM/lit.local.cfg b/llvm/test/tools/llvm-dwarfdump/ARM/lit.local.cfg new file mode 100644 index 0000000000000..236e1d3441665 --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/ARM/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'ARM' in config.root.targets: + config.unsupported = True From 3d334df58742ff53fb00aa3caeb7eb5da3436348 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 16 Mar 2023 13:09:44 -0400 Subject: [PATCH 037/208] [libc++] Remove availability markup for std::format std::format is currently experimental, so there is technically no deployment target requirement for it (since the only symbols required for it are in `libc++experimental.a`). However, some parts of std::format depend indirectly on the floating point std::to_chars implementation, which does have deployment target requirements. 
This patch removes all the availability format for std::format and updates the XFAILs in the tests to properly explain why they fail on old deployment targets, when they do. It also changes a couple of tests to avoid depending on floating-point std::to_chars when it isn't fundamental to the test. Finally, some tests are marked as XFAIL but I added a comment saying TODO FMT This test should not require std::to_chars(floating-point) These tests do not fundamentally depend on floating-point std::to_chars, however they end up failing because calling std::format even without a floating-point argument to format will end up requiring floating-point std::to_chars. I believe this is an implementation artifact that could be avoided in all cases where we know the format string at compile-time. In the tests, I added the TODO comment only to the places where we could do better and actually avoid relying on floating-point std::to_chars because we know the format string at compile-time. Differential Revision: https://reviews.llvm.org/D134598 --- libcxx/include/__availability | 11 ---- libcxx/include/__chrono/formatter.h | 32 +++++----- libcxx/include/__chrono/ostream.h | 35 +++++------ libcxx/include/__format/container_adaptor.h | 8 +-- libcxx/include/__format/format_arg.h | 6 +- libcxx/include/__format/format_args.h | 2 +- libcxx/include/__format/format_context.h | 5 +- libcxx/include/__format/format_functions.h | 62 +++++++++++-------- libcxx/include/__format/format_fwd.h | 6 +- .../include/__format/format_parse_context.h | 2 +- libcxx/include/__format/formatter.h | 2 +- libcxx/include/__format/formatter_bool.h | 2 +- libcxx/include/__format/formatter_char.h | 8 +-- .../__format/formatter_floating_point.h | 6 +- libcxx/include/__format/formatter_integer.h | 26 ++++---- libcxx/include/__format/formatter_pointer.h | 6 +- libcxx/include/__format/formatter_string.h | 10 +-- libcxx/include/__format/formatter_tuple.h | 6 +- .../__format/range_default_formatter.h | 13 ++-- libcxx/include/__format/range_formatter.h | 2 +- libcxx/include/vector | 2 +- .../format.functions.format.pass.cpp | 5 +- .../format.functions.vformat.pass.cpp | 5 +- .../container.adaptors.format/format.pass.cpp | 4 -- .../container.adaptors.format/parse.pass.cpp | 4 -- .../format.functions.format.pass.cpp | 4 ++ .../format.functions.vformat.pass.cpp | 5 +- .../vector.bool.fmt/format.pass.cpp | 4 -- .../vector.bool.fmt/parse.pass.cpp | 4 -- .../time.cal.day.nonmembers/ostream.pass.cpp | 4 ++ .../time.cal.md.nonmembers/ostream.pass.cpp | 4 ++ .../time.cal/time.cal.mdlast/ostream.pass.cpp | 4 ++ .../ostream.pass.cpp | 4 ++ .../time.cal.mwd.nonmembers/ostream.pass.cpp | 4 ++ .../ostream.pass.cpp | 4 ++ .../ostream.pass.cpp | 4 ++ .../ostream.pass.cpp | 4 ++ .../ostream.pass.cpp | 4 ++ .../time.cal.year.nonmembers/ostream.pass.cpp | 4 ++ .../time.cal.ym.nonmembers/ostream.pass.cpp | 4 ++ .../time.cal.ymd.nonmembers/ostream.pass.cpp | 4 ++ .../ostream.pass.cpp | 4 ++ .../time.cal.ymwd.nonmembers/ostream.pass.cpp | 4 ++ .../ostream.pass.cpp | 4 ++ .../time.duration.nonmember/ostream.pass.cpp | 4 ++ .../time.hms.nonmembers/ostream.pass.cpp | 3 + .../std/time/time.syn/formatter.day.pass.cpp | 4 ++ .../time/time.syn/formatter.duration.pass.cpp | 3 + .../time/time.syn/formatter.hh_mm_ss.pass.cpp | 3 + .../time/time.syn/formatter.month.pass.cpp | 4 ++ .../time.syn/formatter.month_day.pass.cpp | 4 ++ .../formatter.month_day_last.pass.cpp | 4 ++ .../time.syn/formatter.month_weekday.pass.cpp | 4 ++ .../time/time.syn/formatter.weekday.pass.cpp | 4 ++ 
.../time.syn/formatter.weekday_index.pass.cpp | 4 ++ .../time.syn/formatter.weekday_last.pass.cpp | 4 ++ .../std/time/time.syn/formatter.year.pass.cpp | 4 ++ .../time.syn/formatter.year_month.pass.cpp | 4 ++ .../formatter.year_month_day.pass.cpp | 4 ++ .../formatter.year_month_day_last.pass.cpp | 4 ++ .../formatter.year_month_weekday.pass.cpp | 4 ++ ...formatter.year_month_weekday_last.pass.cpp | 4 ++ .../make_format_args.pass.cpp | 4 +- .../make_wformat_args.pass.cpp | 4 +- .../format.arg/visit_format_arg.pass.cpp | 3 - .../format.arguments/format.args/get.pass.cpp | 3 - .../format/format.error/format.error.pass.cpp | 3 - .../format/format.fmt.string/ctor.verify.cpp | 4 +- .../format/format.fmt.string/get.pass.cpp | 4 +- .../format.fmt.string/types.compile.pass.cpp | 9 ++- .../concept.formattable.compile.pass.cpp | 7 ++- ...concept.formattable.float.compile.pass.cpp | 58 +++++++++++++++++ .../formatter.floating_point.pass.cpp | 3 + .../format.parse.ctx/check_arg_id.pass.cpp | 3 - .../format.parse.ctx/next_arg_id.pass.cpp | 3 - .../format/format.functions/P2418.pass.cpp | 4 ++ .../format/format.functions/ascii.pass.cpp | 4 ++ .../escaped_output.ascii.pass.cpp | 4 ++ .../escaped_output.unicode.pass.cpp | 4 ++ .../format.functions/format.locale.pass.cpp | 3 + .../format.functions/format.locale.verify.cpp | 4 ++ .../format/format.functions/format.pass.cpp | 3 + .../format/format.functions/format.verify.cpp | 4 ++ .../format_to.locale.pass.cpp | 3 + .../format_to.locale.verify.cpp | 4 ++ .../format.functions/format_to.pass.cpp | 3 + .../format.functions/format_to.verify.cpp | 4 ++ .../format_to_n.locale.pass.cpp | 3 + .../format_to_n.locale.verify.cpp | 4 ++ .../format.functions/format_to_n.pass.cpp | 3 + .../format.functions/format_to_n.verify.cpp | 4 ++ .../formatted_size.locale.pass.cpp | 3 + .../formatted_size.locale.verify.cpp | 4 ++ .../format.functions/formatted_size.pass.cpp | 3 + .../formatted_size.verify.cpp | 4 ++ .../locale-specific_form.pass.cpp | 3 + .../format/format.functions/unicode.pass.cpp | 4 ++ .../format.functions/vformat.locale.pass.cpp | 3 + .../format/format.functions/vformat.pass.cpp | 3 + .../vformat_to.locale.pass.cpp | 3 + .../format.functions/vformat_to.pass.cpp | 3 + .../format.range.fmtdef/format.pass.cpp | 4 -- .../format.range.fmtdef/parse.pass.cpp | 4 -- .../format.range.fmtdef/set_brackets.pass.cpp | 4 -- .../set_separator.pass.cpp | 4 -- .../format_kind.compile.pass.cpp | 3 + .../format.functions.format.pass.cpp | 5 +- .../format.functions.vformat.pass.cpp | 5 +- .../format.range.fmtmap/format.pass.cpp | 4 -- .../format.range.fmtmap/parse.pass.cpp | 4 -- .../format.functions.format.pass.cpp | 5 +- .../format.functions.vformat.pass.cpp | 5 +- .../format.range.fmtset/format.pass.cpp | 4 -- .../format.range.fmtset/parse.pass.cpp | 4 -- .../format.functions.format.pass.cpp | 5 +- .../format.functions.vformat.pass.cpp | 5 +- .../format.range.formatter/format.pass.cpp | 4 -- .../format.range.formatter/parse.pass.cpp | 4 -- .../set_brackets.pass.cpp | 4 -- .../set_separator.pass.cpp | 4 -- .../underlying.pass.cpp | 4 -- .../format.functions.format.pass.cpp | 6 +- .../format.functions.format.verify.cpp | 4 ++ .../format.functions.vformat.pass.cpp | 5 +- .../format/format.tuple/format.pass.cpp | 6 +- .../format/format.tuple/parse.pass.cpp | 6 +- .../format/format.tuple/set_brackets.pass.cpp | 6 +- .../format.tuple/set_separator.pass.cpp | 6 +- libcxx/utils/ci/run-buildbot | 6 -- 129 files changed, 460 insertions(+), 285 deletions(-) create mode 100644 
libcxx/test/std/utilities/format/format.formattable/concept.formattable.float.compile.pass.cpp diff --git a/libcxx/include/__availability b/libcxx/include/__availability index c03d373cafb51..5978dabdacb5f 100644 --- a/libcxx/include/__availability +++ b/libcxx/include/__availability @@ -149,13 +149,6 @@ // # define _LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_latch // # define _LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_semaphore - // This controls the availability of the C++20 format library. - // The library is in development and not ABI stable yet. P2216 is - // retroactively accepted in C++20. This paper contains ABI breaking - // changes. -# define _LIBCPP_AVAILABILITY_FORMAT -// # define _LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_format - // This controls whether the library claims to provide a default verbose // termination function, and consequently whether the headers will try // to use it when the mechanism isn't overriden at compile-time. @@ -259,10 +252,6 @@ # define _LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_semaphore # endif -# define _LIBCPP_AVAILABILITY_FORMAT \ - __attribute__((unavailable)) -# define _LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_format - # define _LIBCPP_HAS_NO_VERBOSE_ABORT_IN_LIBRARY #else diff --git a/libcxx/include/__chrono/formatter.h b/libcxx/include/__chrono/formatter.h index ee7cf93fc79b9..f6e0f9602fdd9 100644 --- a/libcxx/include/__chrono/formatter.h +++ b/libcxx/include/__chrono/formatter.h @@ -542,7 +542,7 @@ __format_chrono(const _Tp& __value, } // namespace __formatter template <__fmt_char_type _CharT> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT __formatter_chrono { +struct _LIBCPP_TEMPLATE_VIS __formatter_chrono { public: _LIBCPP_HIDE_FROM_ABI constexpr auto __parse( basic_format_parse_context<_CharT>& __parse_ctx, __format_spec::__fields __fields, __format_spec::__flags __flags) @@ -582,7 +582,7 @@ struct formatter, _CharT> : public __formatter_c }; template <__fmt_char_type _CharT> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_chrono<_CharT> { public: using _Base = __formatter_chrono<_CharT>; @@ -594,7 +594,7 @@ struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_chrono<_CharT> { public: using _Base = __formatter_chrono<_CharT>; @@ -606,7 +606,7 @@ struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_chrono<_CharT> { public: using _Base = __formatter_chrono<_CharT>; @@ -618,7 +618,7 @@ struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_chrono<_CharT> { public: using _Base = __formatter_chrono<_CharT>; @@ -630,7 +630,7 @@ struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_chrono<_CharT> { public: using _Base = __formatter_chrono<_CharT>; @@ -642,7 +642,7 @@ struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_chrono<_CharT> { public: using _Base = 
__formatter_chrono<_CharT>; @@ -654,7 +654,7 @@ struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_chrono<_CharT> { public: using _Base = __formatter_chrono<_CharT>; @@ -666,7 +666,7 @@ struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_chrono<_CharT> { public: using _Base = __formatter_chrono<_CharT>; @@ -678,7 +678,7 @@ struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_chrono<_CharT> { public: using _Base = __formatter_chrono<_CharT>; @@ -690,7 +690,7 @@ struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_chrono<_CharT> { public: using _Base = __formatter_chrono<_CharT>; @@ -702,7 +702,7 @@ struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_chrono<_CharT> { public: using _Base = __formatter_chrono<_CharT>; @@ -714,7 +714,7 @@ struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_chrono<_CharT> { public: using _Base = __formatter_chrono<_CharT>; @@ -726,7 +726,7 @@ struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_chrono<_CharT> { public: using _Base = __formatter_chrono<_CharT>; @@ -738,7 +738,7 @@ struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_chrono<_CharT> { public: using _Base = __formatter_chrono<_CharT>; @@ -750,7 +750,7 @@ struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_chrono<_CharT> { public: using _Base = __formatter_chrono<_CharT>; diff --git a/libcxx/include/__chrono/ostream.h b/libcxx/include/__chrono/ostream.h index 23feb9d711303..2f34115c729a3 100644 --- a/libcxx/include/__chrono/ostream.h +++ b/libcxx/include/__chrono/ostream.h @@ -93,7 +93,7 @@ _LIBCPP_HIDE_FROM_ABI auto __units_suffix() { } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT basic_ostream<_CharT, _Traits>& +_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>& operator<<(basic_ostream<_CharT, _Traits>& __os, const duration<_Rep, _Period>& __d) { basic_ostringstream<_CharT, _Traits> __s; __s.flags(__os.flags()); @@ -104,8 +104,7 @@ operator<<(basic_ostream<_CharT, _Traits>& __os, const duration<_Rep, _Period>& } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT basic_ostream<_CharT, _Traits>& -operator<<(basic_ostream<_CharT, _Traits>& __os, const day& __d) { +_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>& operator<<(basic_ostream<_CharT, _Traits>& __os, const day& __d) { return __os << (__d.ok() ? 
std::format(_LIBCPP_STATICALLY_WIDEN(_CharT, "{:%d}"), __d) // Note this error differs from the wording of the Standard. The // Standard wording doesn't work well on AIX or Windows. There @@ -117,7 +116,7 @@ operator<<(basic_ostream<_CharT, _Traits>& __os, const day& __d) { } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT basic_ostream<_CharT, _Traits>& +_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>& operator<<(basic_ostream<_CharT, _Traits>& __os, const month& __m) { return __os << (__m.ok() ? std::format(__os.getloc(), _LIBCPP_STATICALLY_WIDEN(_CharT, "{:L%b}"), __m) : std::format(__os.getloc(), @@ -126,14 +125,14 @@ operator<<(basic_ostream<_CharT, _Traits>& __os, const month& __m) { } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT basic_ostream<_CharT, _Traits>& +_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>& operator<<(basic_ostream<_CharT, _Traits>& __os, const year& __y) { return __os << (__y.ok() ? std::format(_LIBCPP_STATICALLY_WIDEN(_CharT, "{:%Y}"), __y) : std::format(_LIBCPP_STATICALLY_WIDEN(_CharT, "{:%Y} is not a valid year"), __y)); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT basic_ostream<_CharT, _Traits>& +_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>& operator<<(basic_ostream<_CharT, _Traits>& __os, const weekday& __wd) { return __os << (__wd.ok() ? std::format(__os.getloc(), _LIBCPP_STATICALLY_WIDEN(_CharT, "{:L%a}"), __wd) : std::format(__os.getloc(), // TODO FMT Standard mandated locale isn't used. @@ -142,7 +141,7 @@ operator<<(basic_ostream<_CharT, _Traits>& __os, const weekday& __wd) { } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT basic_ostream<_CharT, _Traits>& +_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>& operator<<(basic_ostream<_CharT, _Traits>& __os, const weekday_indexed& __wdi) { auto __i = __wdi.index(); return __os << (__i >= 1 && __i <= 5 @@ -154,13 +153,13 @@ operator<<(basic_ostream<_CharT, _Traits>& __os, const weekday_indexed& __wdi) { } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT basic_ostream<_CharT, _Traits>& +_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>& operator<<(basic_ostream<_CharT, _Traits>& __os, const weekday_last& __wdl) { return __os << std::format(__os.getloc(), _LIBCPP_STATICALLY_WIDEN(_CharT, "{:L}[last]"), __wdl.weekday()); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT basic_ostream<_CharT, _Traits>& +_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>& operator<<(basic_ostream<_CharT, _Traits>& __os, const month_day& __md) { // TODO FMT The Standard allows 30th of February to be printed. // It would be nice to show an error message instead. 
@@ -168,47 +167,47 @@ operator<<(basic_ostream<_CharT, _Traits>& __os, const month_day& __md) { } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT basic_ostream<_CharT, _Traits>& +_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>& operator<<(basic_ostream<_CharT, _Traits>& __os, const month_day_last& __mdl) { return __os << std::format(__os.getloc(), _LIBCPP_STATICALLY_WIDEN(_CharT, "{:L}/last"), __mdl.month()); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT basic_ostream<_CharT, _Traits>& +_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>& operator<<(basic_ostream<_CharT, _Traits>& __os, const month_weekday& __mwd) { return __os << std::format( __os.getloc(), _LIBCPP_STATICALLY_WIDEN(_CharT, "{:L}/{:L}"), __mwd.month(), __mwd.weekday_indexed()); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT basic_ostream<_CharT, _Traits>& +_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>& operator<<(basic_ostream<_CharT, _Traits>& __os, const month_weekday_last& __mwdl) { return __os << std::format( __os.getloc(), _LIBCPP_STATICALLY_WIDEN(_CharT, "{:L}/{:L}"), __mwdl.month(), __mwdl.weekday_last()); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT basic_ostream<_CharT, _Traits>& +_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>& operator<<(basic_ostream<_CharT, _Traits>& __os, const year_month& __ym) { return __os << std::format(__os.getloc(), _LIBCPP_STATICALLY_WIDEN(_CharT, "{}/{:L}"), __ym.year(), __ym.month()); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT basic_ostream<_CharT, _Traits>& +_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>& operator<<(basic_ostream<_CharT, _Traits>& __os, const year_month_day& __ymd) { return __os << (__ymd.ok() ? std::format(_LIBCPP_STATICALLY_WIDEN(_CharT, "{:%F}"), __ymd) : std::format(_LIBCPP_STATICALLY_WIDEN(_CharT, "{:%F} is not a valid date"), __ymd)); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT basic_ostream<_CharT, _Traits>& +_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>& operator<<(basic_ostream<_CharT, _Traits>& __os, const year_month_day_last& __ymdl) { return __os << std::format( __os.getloc(), _LIBCPP_STATICALLY_WIDEN(_CharT, "{}/{:L}"), __ymdl.year(), __ymdl.month_day_last()); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT basic_ostream<_CharT, _Traits>& +_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>& operator<<(basic_ostream<_CharT, _Traits>& __os, const year_month_weekday& __ymwd) { return __os << std::format( __os.getloc(), @@ -219,7 +218,7 @@ operator<<(basic_ostream<_CharT, _Traits>& __os, const year_month_weekday& __ymw } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT basic_ostream<_CharT, _Traits>& +_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>& operator<<(basic_ostream<_CharT, _Traits>& __os, const year_month_weekday_last& __ymwdl) { return __os << std::format( __os.getloc(), @@ -230,7 +229,7 @@ operator<<(basic_ostream<_CharT, _Traits>& __os, const year_month_weekday_last& } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT basic_ostream<_CharT, _Traits>& +_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>& operator<<(basic_ostream<_CharT, _Traits>& __os, const hh_mm_ss<_Duration> __hms) { return __os << std::format(__os.getloc(), _LIBCPP_STATICALLY_WIDEN(_CharT, "{:L%T}"), __hms); } diff --git a/libcxx/include/__format/container_adaptor.h b/libcxx/include/__format/container_adaptor.h index 9439b10c29914..5b95f600f0cdc 100644 --- a/libcxx/include/__format/container_adaptor.h +++ 
b/libcxx/include/__format/container_adaptor.h @@ -37,7 +37,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD // adaptor headers. To use the format functions users already include . template -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT __formatter_container_adaptor { +struct _LIBCPP_TEMPLATE_VIS __formatter_container_adaptor { private: using __maybe_const_container = __fmt_maybe_const; using __maybe_const_adaptor = __maybe_const, _Adaptor>; @@ -57,15 +57,15 @@ struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT __formatter_container_ad }; template _Container> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter, _CharT> +struct _LIBCPP_TEMPLATE_VIS formatter, _CharT> : public __formatter_container_adaptor, _CharT> {}; template -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter, _CharT> +struct _LIBCPP_TEMPLATE_VIS formatter, _CharT> : public __formatter_container_adaptor, _CharT> {}; template _Container> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter, _CharT> +struct _LIBCPP_TEMPLATE_VIS formatter, _CharT> : public __formatter_container_adaptor, _CharT> {}; #endif //_LIBCPP_STD_VER >= 23 diff --git a/libcxx/include/__format/format_arg.h b/libcxx/include/__format/format_arg.h index a25976a0795e7..7e37dd4f0b377 100644 --- a/libcxx/include/__format/format_arg.h +++ b/libcxx/include/__format/format_arg.h @@ -95,7 +95,7 @@ constexpr __arg_t __get_packed_type(uint64_t __types, size_t __id) { // This function is not user obervable, so it can directly use the non-standard // types of the "variant". See __arg_t for more details. template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT decltype(auto) +_LIBCPP_HIDE_FROM_ABI decltype(auto) __visit_format_arg(_Visitor&& __vis, basic_format_arg<_Context> __arg) { switch (__arg.__type_) { case __format::__arg_t::__none: @@ -225,7 +225,7 @@ class __basic_format_arg_value { }; template -class _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT basic_format_arg { +class _LIBCPP_TEMPLATE_VIS basic_format_arg { public: class _LIBCPP_TEMPLATE_VIS handle; @@ -277,7 +277,7 @@ class _LIBCPP_TEMPLATE_VIS basic_format_arg<_Context>::handle { // This function is user facing, so it must wrap the non-standard types of // the "variant" in a handle to stay conforming. See __arg_t for more details. 
template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT decltype(auto) +_LIBCPP_HIDE_FROM_ABI decltype(auto) visit_format_arg(_Visitor&& __vis, basic_format_arg<_Context> __arg) { switch (__arg.__type_) { # ifndef _LIBCPP_HAS_NO_INT128 diff --git a/libcxx/include/__format/format_args.h b/libcxx/include/__format/format_args.h index 32f1de97c2d1c..c2e7c96412377 100644 --- a/libcxx/include/__format/format_args.h +++ b/libcxx/include/__format/format_args.h @@ -27,7 +27,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 template -class _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT basic_format_args { +class _LIBCPP_TEMPLATE_VIS basic_format_args { public: _LIBCPP_HIDE_FROM_ABI basic_format_args() noexcept = default; diff --git a/libcxx/include/__format/format_context.h b/libcxx/include/__format/format_context.h index b8a9a54cf1e22..521131db84d80 100644 --- a/libcxx/include/__format/format_context.h +++ b/libcxx/include/__format/format_context.h @@ -41,7 +41,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template requires output_iterator<_OutIt, const _CharT&> -class _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT basic_format_context; +class _LIBCPP_TEMPLATE_VIS basic_format_context; #ifndef _LIBCPP_HAS_NO_LOCALIZATION /** @@ -80,7 +80,6 @@ requires output_iterator<_OutIt, const _CharT&> class // clang-format off _LIBCPP_TEMPLATE_VIS - _LIBCPP_AVAILABILITY_FORMAT _LIBCPP_PREFERRED_NAME(format_context) _LIBCPP_IF_WIDE_CHARACTERS(_LIBCPP_PREFERRED_NAME(wformat_context)) // clang-format on @@ -162,7 +161,7 @@ class // Here the width of an element in input is determined dynamically. // Note when the top-level element has no width the retargeting is not needed. template -class _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT +class _LIBCPP_TEMPLATE_VIS basic_format_context::__iterator, _CharT> { public: using iterator = typename __format::__retarget_buffer<_CharT>::__iterator; diff --git a/libcxx/include/__format/format_functions.h b/libcxx/include/__format/format_functions.h index 218ae5b34c173..75afd92ce0566 100644 --- a/libcxx/include/__format/format_functions.h +++ b/libcxx/include/__format/format_functions.h @@ -344,7 +344,7 @@ struct _LIBCPP_TEMPLATE_VIS basic_format_string { _Context{__types_.data(), __handles_.data(), sizeof...(_Args)}); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT constexpr basic_string_view<_CharT> get() const noexcept { + _LIBCPP_HIDE_FROM_ABI constexpr basic_string_view<_CharT> get() const noexcept { return __str_; } @@ -409,21 +409,21 @@ requires(output_iterator<_OutIt, const _CharT&>) _LIBCPP_HIDE_FROM_ABI _OutIt // https://reviews.llvm.org/D110499#inline-1180704 // TODO FMT Evaluate whether we want to file a Clang bug report regarding this. template _OutIt> -_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT _OutIt +_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _OutIt vformat_to(_OutIt __out_it, string_view __fmt, format_args __args) { return _VSTD::__vformat_to(_VSTD::move(__out_it), __fmt, __args); } #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS template _OutIt> -_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT _OutIt +_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _OutIt vformat_to(_OutIt __out_it, wstring_view __fmt, wformat_args __args) { return _VSTD::__vformat_to(_VSTD::move(__out_it), __fmt, __args); } #endif template _OutIt, class... 
_Args> -_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT _OutIt +_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _OutIt format_to(_OutIt __out_it, format_string<_Args...> __fmt, _Args&&... __args) { return _VSTD::vformat_to(_VSTD::move(__out_it), __fmt.get(), _VSTD::make_format_args(__args...)); @@ -431,14 +431,17 @@ format_to(_OutIt __out_it, format_string<_Args...> __fmt, _Args&&... __args) { #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS template _OutIt, class... _Args> -_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT _OutIt +_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _OutIt format_to(_OutIt __out_it, wformat_string<_Args...> __fmt, _Args&&... __args) { return _VSTD::vformat_to(_VSTD::move(__out_it), __fmt.get(), _VSTD::make_wformat_args(__args...)); } #endif -_LIBCPP_ALWAYS_INLINE inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT string +// TODO FMT This needs to be a template or std::to_chars(floating-point) availability markup +// fires too eagerly, see http://llvm.org/PR61563. +template +_LIBCPP_ALWAYS_INLINE inline _LIBCPP_HIDE_FROM_ABI string vformat(string_view __fmt, format_args __args) { string __res; _VSTD::vformat_to(_VSTD::back_inserter(__res), __fmt, __args); @@ -446,7 +449,10 @@ vformat(string_view __fmt, format_args __args) { } #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS -_LIBCPP_ALWAYS_INLINE inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT wstring +// TODO FMT This needs to be a template or std::to_chars(floating-point) availability markup +// fires too eagerly, see http://llvm.org/PR61563. +template +_LIBCPP_ALWAYS_INLINE inline _LIBCPP_HIDE_FROM_ABI wstring vformat(wstring_view __fmt, wformat_args __args) { wstring __res; _VSTD::vformat_to(_VSTD::back_inserter(__res), __fmt, __args); @@ -455,14 +461,14 @@ vformat(wstring_view __fmt, wformat_args __args) { #endif template -_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT string format(format_string<_Args...> __fmt, +_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI string format(format_string<_Args...> __fmt, _Args&&... __args) { return _VSTD::vformat(__fmt.get(), _VSTD::make_format_args(__args...)); } #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS template -_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT wstring +_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI wstring format(wformat_string<_Args...> __fmt, _Args&&... __args) { return _VSTD::vformat(__fmt.get(), _VSTD::make_wformat_args(__args...)); } @@ -479,14 +485,14 @@ _LIBCPP_HIDE_FROM_ABI format_to_n_result<_OutIt> __vformat_to_n(_OutIt __out_it, } template _OutIt, class... _Args> -_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT format_to_n_result<_OutIt> +_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI format_to_n_result<_OutIt> format_to_n(_OutIt __out_it, iter_difference_t<_OutIt> __n, format_string<_Args...> __fmt, _Args&&... __args) { return _VSTD::__vformat_to_n(_VSTD::move(__out_it), __n, __fmt.get(), _VSTD::make_format_args(__args...)); } #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS template _OutIt, class... _Args> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT format_to_n_result<_OutIt> +_LIBCPP_HIDE_FROM_ABI format_to_n_result<_OutIt> format_to_n(_OutIt __out_it, iter_difference_t<_OutIt> __n, wformat_string<_Args...> __fmt, _Args&&... 
__args) { return _VSTD::__vformat_to_n(_VSTD::move(__out_it), __n, __fmt.get(), _VSTD::make_wformat_args(__args...)); @@ -502,14 +508,14 @@ _LIBCPP_HIDE_FROM_ABI size_t __vformatted_size(basic_string_view<_CharT> __fmt, } template -_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT size_t +_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI size_t formatted_size(format_string<_Args...> __fmt, _Args&&... __args) { return _VSTD::__vformatted_size(__fmt.get(), basic_format_args{_VSTD::make_format_args(__args...)}); } #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS template -_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT size_t +_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI size_t formatted_size(wformat_string<_Args...> __fmt, _Args&&... __args) { return _VSTD::__vformatted_size(__fmt.get(), basic_format_args{_VSTD::make_wformat_args(__args...)}); } @@ -536,7 +542,7 @@ requires(output_iterator<_OutIt, const _CharT&>) _LIBCPP_HIDE_FROM_ABI _OutIt } template _OutIt> -_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT _OutIt vformat_to( +_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _OutIt vformat_to( _OutIt __out_it, locale __loc, string_view __fmt, format_args __args) { return _VSTD::__vformat_to(_VSTD::move(__out_it), _VSTD::move(__loc), __fmt, __args); @@ -544,7 +550,7 @@ _LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT _OutIt v #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS template _OutIt> -_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT _OutIt vformat_to( +_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _OutIt vformat_to( _OutIt __out_it, locale __loc, wstring_view __fmt, wformat_args __args) { return _VSTD::__vformat_to(_VSTD::move(__out_it), _VSTD::move(__loc), __fmt, __args); @@ -552,7 +558,7 @@ _LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT _OutIt v #endif template _OutIt, class... _Args> -_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT _OutIt +_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _OutIt format_to(_OutIt __out_it, locale __loc, format_string<_Args...> __fmt, _Args&&... __args) { return _VSTD::vformat_to(_VSTD::move(__out_it), _VSTD::move(__loc), __fmt.get(), _VSTD::make_format_args(__args...)); @@ -560,14 +566,17 @@ format_to(_OutIt __out_it, locale __loc, format_string<_Args...> __fmt, _Args&&. #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS template _OutIt, class... _Args> -_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT _OutIt +_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _OutIt format_to(_OutIt __out_it, locale __loc, wformat_string<_Args...> __fmt, _Args&&... __args) { return _VSTD::vformat_to(_VSTD::move(__out_it), _VSTD::move(__loc), __fmt.get(), _VSTD::make_wformat_args(__args...)); } #endif -_LIBCPP_ALWAYS_INLINE inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT string +// TODO FMT This needs to be a template or std::to_chars(floating-point) availability markup +// fires too eagerly, see http://llvm.org/PR61563. 
+template +_LIBCPP_ALWAYS_INLINE inline _LIBCPP_HIDE_FROM_ABI string vformat(locale __loc, string_view __fmt, format_args __args) { string __res; _VSTD::vformat_to(_VSTD::back_inserter(__res), _VSTD::move(__loc), __fmt, @@ -576,7 +585,10 @@ vformat(locale __loc, string_view __fmt, format_args __args) { } #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS -_LIBCPP_ALWAYS_INLINE inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT wstring +// TODO FMT This needs to be a template or std::to_chars(floating-point) availability markup +// fires too eagerly, see http://llvm.org/PR61563. +template +_LIBCPP_ALWAYS_INLINE inline _LIBCPP_HIDE_FROM_ABI wstring vformat(locale __loc, wstring_view __fmt, wformat_args __args) { wstring __res; _VSTD::vformat_to(_VSTD::back_inserter(__res), _VSTD::move(__loc), __fmt, @@ -586,7 +598,7 @@ vformat(locale __loc, wstring_view __fmt, wformat_args __args) { #endif template -_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT string format(locale __loc, +_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI string format(locale __loc, format_string<_Args...> __fmt, _Args&&... __args) { return _VSTD::vformat(_VSTD::move(__loc), __fmt.get(), @@ -595,7 +607,7 @@ _LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT string f #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS template -_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT wstring +_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI wstring format(locale __loc, wformat_string<_Args...> __fmt, _Args&&... __args) { return _VSTD::vformat(_VSTD::move(__loc), __fmt.get(), _VSTD::make_wformat_args(__args...)); @@ -614,7 +626,7 @@ _LIBCPP_HIDE_FROM_ABI format_to_n_result<_OutIt> __vformat_to_n(_OutIt __out_it, } template _OutIt, class... _Args> -_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT format_to_n_result<_OutIt> +_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI format_to_n_result<_OutIt> format_to_n(_OutIt __out_it, iter_difference_t<_OutIt> __n, locale __loc, format_string<_Args...> __fmt, _Args&&... __args) { return _VSTD::__vformat_to_n(_VSTD::move(__out_it), __n, _VSTD::move(__loc), __fmt.get(), @@ -623,7 +635,7 @@ format_to_n(_OutIt __out_it, iter_difference_t<_OutIt> __n, locale __loc, format #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS template _OutIt, class... _Args> -_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT format_to_n_result<_OutIt> +_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI format_to_n_result<_OutIt> format_to_n(_OutIt __out_it, iter_difference_t<_OutIt> __n, locale __loc, wformat_string<_Args...> __fmt, _Args&&... __args) { return _VSTD::__vformat_to_n(_VSTD::move(__out_it), __n, _VSTD::move(__loc), __fmt.get(), @@ -641,14 +653,14 @@ _LIBCPP_HIDE_FROM_ABI size_t __vformatted_size(locale __loc, basic_string_view<_ } template -_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT size_t +_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI size_t formatted_size(locale __loc, format_string<_Args...> __fmt, _Args&&... __args) { return _VSTD::__vformatted_size(_VSTD::move(__loc), __fmt.get(), basic_format_args{_VSTD::make_format_args(__args...)}); } #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS template -_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FORMAT size_t +_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI size_t formatted_size(locale __loc, wformat_string<_Args...> __fmt, _Args&&... 
__args) { return _VSTD::__vformatted_size(_VSTD::move(__loc), __fmt.get(), basic_format_args{_VSTD::make_wformat_args(__args...)}); } diff --git a/libcxx/include/__format/format_fwd.h b/libcxx/include/__format/format_fwd.h index d3e573f893672..120b2fc8d47de 100644 --- a/libcxx/include/__format/format_fwd.h +++ b/libcxx/include/__format/format_fwd.h @@ -23,14 +23,14 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 template -class _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT basic_format_arg; +class _LIBCPP_TEMPLATE_VIS basic_format_arg; template requires output_iterator<_OutIt, const _CharT&> -class _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT basic_format_context; +class _LIBCPP_TEMPLATE_VIS basic_format_context; template -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter; +struct _LIBCPP_TEMPLATE_VIS formatter; #endif //_LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__format/format_parse_context.h b/libcxx/include/__format/format_parse_context.h index ac2f5a843405f..79f53f77d4a05 100644 --- a/libcxx/include/__format/format_parse_context.h +++ b/libcxx/include/__format/format_parse_context.h @@ -24,7 +24,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 template -class _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT basic_format_parse_context { +class _LIBCPP_TEMPLATE_VIS basic_format_parse_context { public: using char_type = _CharT; using const_iterator = typename basic_string_view<_CharT>::const_iterator; diff --git a/libcxx/include/__format/formatter.h b/libcxx/include/__format/formatter.h index e2c58889c5e17..172b2d5f7b8a1 100644 --- a/libcxx/include/__format/formatter.h +++ b/libcxx/include/__format/formatter.h @@ -32,7 +32,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD /// - is_copy_assignable, and /// - is_move_assignable. 
template -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter { +struct _LIBCPP_TEMPLATE_VIS formatter { formatter() = delete; formatter(const formatter&) = delete; formatter& operator=(const formatter&) = delete; diff --git a/libcxx/include/__format/formatter_bool.h b/libcxx/include/__format/formatter_bool.h index 84f8bcfa629bf..1fb75755fc572 100644 --- a/libcxx/include/__format/formatter_bool.h +++ b/libcxx/include/__format/formatter_bool.h @@ -36,7 +36,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 template <__fmt_char_type _CharT> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter { +struct _LIBCPP_TEMPLATE_VIS formatter { public: _LIBCPP_HIDE_FROM_ABI constexpr auto parse(basic_format_parse_context<_CharT>& __parse_ctx) -> decltype(__parse_ctx.begin()) { diff --git a/libcxx/include/__format/formatter_char.h b/libcxx/include/__format/formatter_char.h index eaac70cbe18ee..7d63c042c554f 100644 --- a/libcxx/include/__format/formatter_char.h +++ b/libcxx/include/__format/formatter_char.h @@ -31,7 +31,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 template <__fmt_char_type _CharT> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT __formatter_char { +struct _LIBCPP_TEMPLATE_VIS __formatter_char { public: _LIBCPP_HIDE_FROM_ABI constexpr auto parse(basic_format_parse_context<_CharT>& __parse_ctx) -> decltype(__parse_ctx.begin()) { @@ -74,14 +74,14 @@ struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT __formatter_char { }; template <> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter : public __formatter_char {}; +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_char {}; # ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS template <> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter : public __formatter_char {}; +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_char {}; template <> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter : public __formatter_char { +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_char { }; # endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS diff --git a/libcxx/include/__format/formatter_floating_point.h b/libcxx/include/__format/formatter_floating_point.h index 31cd012e484ad..cac74c1f2a62c 100644 --- a/libcxx/include/__format/formatter_floating_point.h +++ b/libcxx/include/__format/formatter_floating_point.h @@ -739,13 +739,13 @@ struct _LIBCPP_TEMPLATE_VIS __formatter_floating_point { }; template <__fmt_char_type _CharT> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_floating_point<_CharT> {}; template <__fmt_char_type _CharT> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_floating_point<_CharT> {}; template <__fmt_char_type _CharT> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_floating_point<_CharT> {}; #endif //_LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__format/formatter_integer.h b/libcxx/include/__format/formatter_integer.h index f157698818ac8..0e144100da9ab 100644 --- a/libcxx/include/__format/formatter_integer.h +++ b/libcxx/include/__format/formatter_integer.h @@ -31,7 +31,7 @@ #if _LIBCPP_STD_VER >= 20 template <__fmt_char_type _CharT> - struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT __formatter_integer { + struct _LIBCPP_TEMPLATE_VIS __formatter_integer { public: 
_LIBCPP_HIDE_FROM_ABI constexpr auto @@ -60,43 +60,43 @@ // Signed integral types. template <__fmt_char_type _CharT> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_integer<_CharT> {}; template <__fmt_char_type _CharT> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter : public __formatter_integer<_CharT> { +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_integer<_CharT> { }; template <__fmt_char_type _CharT> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter : public __formatter_integer<_CharT> {}; +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_integer<_CharT> {}; template <__fmt_char_type _CharT> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter : public __formatter_integer<_CharT> {}; +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_integer<_CharT> {}; template <__fmt_char_type _CharT> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_integer<_CharT> {}; # ifndef _LIBCPP_HAS_NO_INT128 template <__fmt_char_type _CharT> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter<__int128_t, _CharT> +struct _LIBCPP_TEMPLATE_VIS formatter<__int128_t, _CharT> : public __formatter_integer<_CharT> {}; # endif // Unsigned integral types. template <__fmt_char_type _CharT> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_integer<_CharT> {}; template <__fmt_char_type _CharT> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_integer<_CharT> {}; template <__fmt_char_type _CharT> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_integer<_CharT> {}; template <__fmt_char_type _CharT> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_integer<_CharT> {}; template <__fmt_char_type _CharT> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_integer<_CharT> {}; # ifndef _LIBCPP_HAS_NO_INT128 template <__fmt_char_type _CharT> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter<__uint128_t, _CharT> +struct _LIBCPP_TEMPLATE_VIS formatter<__uint128_t, _CharT> : public __formatter_integer<_CharT> {}; # endif diff --git a/libcxx/include/__format/formatter_pointer.h b/libcxx/include/__format/formatter_pointer.h index fe1b3cb496f1a..48d8372a2341f 100644 --- a/libcxx/include/__format/formatter_pointer.h +++ b/libcxx/include/__format/formatter_pointer.h @@ -55,13 +55,13 @@ struct _LIBCPP_TEMPLATE_VIS __formatter_pointer { // - template<> struct formatter; // - template<> struct formatter; template <__fmt_char_type _CharT> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_pointer<_CharT> {}; template <__fmt_char_type _CharT> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter : public __formatter_pointer<_CharT> { +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_pointer<_CharT> { }; template <__fmt_char_type _CharT> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_pointer<_CharT> {}; #endif 
//_LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__format/formatter_string.h b/libcxx/include/__format/formatter_string.h index e11708d8e28cc..c14518be495bc 100644 --- a/libcxx/include/__format/formatter_string.h +++ b/libcxx/include/__format/formatter_string.h @@ -57,7 +57,7 @@ struct _LIBCPP_TEMPLATE_VIS __formatter_string { // Formatter const char*. template <__fmt_char_type _CharT> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter +struct _LIBCPP_TEMPLATE_VIS formatter : public __formatter_string<_CharT> { using _Base = __formatter_string<_CharT>; @@ -95,7 +95,7 @@ struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter<_CharT*, _CharT> +struct _LIBCPP_TEMPLATE_VIS formatter<_CharT*, _CharT> : public formatter { using _Base = formatter; @@ -106,7 +106,7 @@ struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter<_CharT*, _Char // Formatter char[]. template <__fmt_char_type _CharT, size_t _Size> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter<_CharT[_Size], _CharT> +struct _LIBCPP_TEMPLATE_VIS formatter<_CharT[_Size], _CharT> : public __formatter_string<_CharT> { using _Base = __formatter_string<_CharT>; @@ -117,7 +117,7 @@ struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter<_CharT[_Size], // Formatter std::string. template <__fmt_char_type _CharT, class _Traits, class _Allocator> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter, _CharT> +struct _LIBCPP_TEMPLATE_VIS formatter, _CharT> : public __formatter_string<_CharT> { using _Base = __formatter_string<_CharT>; @@ -130,7 +130,7 @@ struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter, _CharT> +struct _LIBCPP_TEMPLATE_VIS formatter, _CharT> : public __formatter_string<_CharT> { using _Base = __formatter_string<_CharT>; diff --git a/libcxx/include/__format/formatter_tuple.h b/libcxx/include/__format/formatter_tuple.h index d1874ddecb723..e6831de78c227 100644 --- a/libcxx/include/__format/formatter_tuple.h +++ b/libcxx/include/__format/formatter_tuple.h @@ -39,7 +39,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 23 template <__fmt_char_type _CharT, class _Tuple, formattable<_CharT>... _Args> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT __formatter_tuple { +struct _LIBCPP_TEMPLATE_VIS __formatter_tuple { _LIBCPP_HIDE_FROM_ABI constexpr void set_separator(basic_string_view<_CharT> __separator) noexcept { __separator_ = __separator; } @@ -164,11 +164,11 @@ struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT __formatter_tuple { }; template <__fmt_char_type _CharT, formattable<_CharT>... _Args> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter, _CharT> +struct _LIBCPP_TEMPLATE_VIS formatter, _CharT> : public __formatter_tuple<_CharT, pair<_Args...>, _Args...> {}; template <__fmt_char_type _CharT, formattable<_CharT>... 
_Args> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter, _CharT> +struct _LIBCPP_TEMPLATE_VIS formatter, _CharT> : public __formatter_tuple<_CharT, tuple<_Args...>, _Args...> {}; #endif //_LIBCPP_STD_VER >= 23 diff --git a/libcxx/include/__format/range_default_formatter.h b/libcxx/include/__format/range_default_formatter.h index 7fdb254de3b88..eab2951fcf552 100644 --- a/libcxx/include/__format/range_default_formatter.h +++ b/libcxx/include/__format/range_default_formatter.h @@ -84,12 +84,12 @@ inline constexpr range_format format_kind<_Rp> = [] { }(); template -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT __range_default_formatter; +struct _LIBCPP_TEMPLATE_VIS __range_default_formatter; // Required specializations template -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT __range_default_formatter { +struct _LIBCPP_TEMPLATE_VIS __range_default_formatter { private: using __maybe_const_r = __fmt_maybe_const<_Rp, _CharT>; range_formatter>, _CharT> __underlying_; @@ -115,7 +115,7 @@ struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT __range_default_formatte }; template -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT __range_default_formatter { +struct _LIBCPP_TEMPLATE_VIS __range_default_formatter { private: using __maybe_const_map = __fmt_maybe_const<_Rp, _CharT>; using __element_type = remove_cvref_t>; @@ -143,7 +143,7 @@ struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT __range_default_formatte }; template -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT __range_default_formatter { +struct _LIBCPP_TEMPLATE_VIS __range_default_formatter { private: using __maybe_const_set = __fmt_maybe_const<_Rp, _CharT>; using __element_type = remove_cvref_t>; @@ -168,14 +168,13 @@ struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT __range_default_formatte template requires(_Kp == range_format::string || _Kp == range_format::debug_string) -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT __range_default_formatter<_Kp, _Rp, _CharT> { +struct _LIBCPP_TEMPLATE_VIS __range_default_formatter<_Kp, _Rp, _CharT> { __range_default_formatter() = delete; // TODO FMT Implement }; template requires(format_kind<_Rp> != range_format::disabled && formattable, _CharT>) -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter<_Rp, _CharT> - : __range_default_formatter, _Rp, _CharT> {}; +struct _LIBCPP_TEMPLATE_VIS formatter<_Rp, _CharT> : __range_default_formatter, _Rp, _CharT> {}; #endif //_LIBCPP_STD_VER >= 23 diff --git a/libcxx/include/__format/range_formatter.h b/libcxx/include/__format/range_formatter.h index f60151029c9c3..47323433d76fa 100644 --- a/libcxx/include/__format/range_formatter.h +++ b/libcxx/include/__format/range_formatter.h @@ -40,7 +40,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template requires same_as, _Tp> && formattable<_Tp, _CharT> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT range_formatter { +struct _LIBCPP_TEMPLATE_VIS range_formatter { _LIBCPP_HIDE_FROM_ABI constexpr void set_separator(basic_string_view<_CharT> __separator) noexcept { __separator_ = __separator; } diff --git a/libcxx/include/vector b/libcxx/include/vector index 980af582558be..2361ba8e215ff 100644 --- a/libcxx/include/vector +++ b/libcxx/include/vector @@ -3328,7 +3328,7 @@ inline constexpr bool __format::__enable_insertable> = true; template // Since is-vector-bool-reference is only used once it's inlined here. 
requires same_as> -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter<_Tp, CharT> { +struct _LIBCPP_TEMPLATE_VIS formatter<_Tp, CharT> { private: formatter __underlying_; diff --git a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.format.pass.cpp b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.format.pass.cpp index 67a34ef637eec..6753f614351d8 100644 --- a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.format.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.format.pass.cpp @@ -11,9 +11,8 @@ // TODO FMT Fix this test using GCC, it currently times out. // UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} // [container.adaptors.format] // For each of queue, priority_queue, and stack, the library provides the diff --git a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.vformat.pass.cpp b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.vformat.pass.cpp index a7136e7e404e8..ab9b7e1004694 100644 --- a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.vformat.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.vformat.pass.cpp @@ -11,9 +11,8 @@ // TODO FMT Fix this test using GCC, it currently times out. // UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} // [container.adaptors.format] // For each of queue, priority_queue, and stack, the library provides the diff --git a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.pass.cpp b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.pass.cpp index 8950ece9a57e8..9f978ebbbf63e 100644 --- a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.pass.cpp @@ -11,10 +11,6 @@ // TODO FMT Fix this test using GCC, it currently times out. // UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. 
-// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} - // [container.adaptors.format] // For each of queue, priority_queue, and stack, the library provides the // following formatter specialization where adaptor-type is the name of the diff --git a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/parse.pass.cpp b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/parse.pass.cpp index b620279d76079..7a5f3ead39010 100644 --- a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/parse.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/parse.pass.cpp @@ -11,10 +11,6 @@ // TODO FMT Fix this test using GCC, it currently times out. // UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} - // [container.adaptors.format] // For each of queue, priority_queue, and stack, the library provides the // following formatter specialization where adaptor-type is the name of the diff --git a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.format.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.format.pass.cpp index 05a0715c321ed..4f8e0337d652f 100644 --- a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.format.pass.cpp +++ b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.format.pass.cpp @@ -11,6 +11,10 @@ // TODO FMT Fix this test using GCC, it currently times out. // UNSUPPORTED: gcc-12 +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // // template diff --git a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.vformat.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.vformat.pass.cpp index c5a623795957a..c94aedceedb89 100644 --- a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.vformat.pass.cpp +++ b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.vformat.pass.cpp @@ -11,9 +11,8 @@ // TODO FMT Fix this test using GCC, it currently times out. // UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} // diff --git a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.pass.cpp index acb517a068f4c..e20ea9b33035a 100644 --- a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.pass.cpp +++ b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.pass.cpp @@ -11,10 +11,6 @@ // TODO FMT Fix this test using GCC, it currently times out. 
// UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} - // // template diff --git a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/parse.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/parse.pass.cpp index c099cfe1e5965..c6013ce7690dc 100644 --- a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/parse.pass.cpp +++ b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/parse.pass.cpp @@ -11,10 +11,6 @@ // TODO FMT Fix this test using GCC, it currently times out. // UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} - // // template diff --git a/libcxx/test/std/time/time.cal/time.cal.day/time.cal.day.nonmembers/ostream.pass.cpp b/libcxx/test/std/time/time.cal/time.cal.day/time.cal.day.nonmembers/ostream.pass.cpp index 0e1730447e76f..b30ed775b87f0 100644 --- a/libcxx/test/std/time/time.cal/time.cal.day/time.cal.day.nonmembers/ostream.pass.cpp +++ b/libcxx/test/std/time/time.cal/time.cal.day/time.cal.day.nonmembers/ostream.pass.cpp @@ -15,6 +15,10 @@ // TODO FMT Evaluate gcc-12 status // UNSUPPORTED: gcc-12 +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // REQUIRES: locale.fr_FR.UTF-8 // REQUIRES: locale.ja_JP.UTF-8 diff --git a/libcxx/test/std/time/time.cal/time.cal.md/time.cal.md.nonmembers/ostream.pass.cpp b/libcxx/test/std/time/time.cal/time.cal.md/time.cal.md.nonmembers/ostream.pass.cpp index 4e4feda0c8091..de25ac5ed86d0 100644 --- a/libcxx/test/std/time/time.cal/time.cal.md/time.cal.md.nonmembers/ostream.pass.cpp +++ b/libcxx/test/std/time/time.cal/time.cal.md/time.cal.md.nonmembers/ostream.pass.cpp @@ -16,6 +16,10 @@ // TODO FMT It seems GCC uses too much memory in the CI and fails. // UNSUPPORTED: gcc-12 +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // REQUIRES: locale.fr_FR.UTF-8 // REQUIRES: locale.ja_JP.UTF-8 diff --git a/libcxx/test/std/time/time.cal/time.cal.mdlast/ostream.pass.cpp b/libcxx/test/std/time/time.cal/time.cal.mdlast/ostream.pass.cpp index 491d30bf969e2..52424227ca9af 100644 --- a/libcxx/test/std/time/time.cal/time.cal.mdlast/ostream.pass.cpp +++ b/libcxx/test/std/time/time.cal/time.cal.mdlast/ostream.pass.cpp @@ -18,6 +18,10 @@ // TODO FMT It seems GCC uses too much memory in the CI and fails. 
 // UNSUPPORTED: gcc-12
 
+// TODO FMT This test should not require std::to_chars(floating-point)
+// This test requires std::to_chars(floating-point), which is in the dylib
+// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}}
+
 // REQUIRES: locale.fr_FR.UTF-8
 // REQUIRES: locale.ja_JP.UTF-8
 
diff --git a/libcxx/test/std/time/time.cal/time.cal.month/time.cal.month.nonmembers/ostream.pass.cpp b/libcxx/test/std/time/time.cal/time.cal.month/time.cal.month.nonmembers/ostream.pass.cpp
index 2efc023c611b6..86862cc9711f3 100644
--- a/libcxx/test/std/time/time.cal/time.cal.month/time.cal.month.nonmembers/ostream.pass.cpp
+++ b/libcxx/test/std/time/time.cal/time.cal.month/time.cal.month.nonmembers/ostream.pass.cpp
@@ -15,6 +15,10 @@
 // TODO FMT Investigate Windows issues.
 // UNSUPPORTED: msvc, target={{.+}}-windows-gnu
 
+// TODO FMT This test should not require std::to_chars(floating-point)
+// This test requires std::to_chars(floating-point), which is in the dylib
+// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}}
+
 // REQUIRES: locale.fr_FR.UTF-8
 // REQUIRES: locale.ja_JP.UTF-8
 
diff --git a/libcxx/test/std/time/time.cal/time.cal.mwd/time.cal.mwd.nonmembers/ostream.pass.cpp b/libcxx/test/std/time/time.cal/time.cal.mwd/time.cal.mwd.nonmembers/ostream.pass.cpp
index 006df5e29e56b..7fe5611e1496a 100644
--- a/libcxx/test/std/time/time.cal/time.cal.mwd/time.cal.mwd.nonmembers/ostream.pass.cpp
+++ b/libcxx/test/std/time/time.cal/time.cal.mwd/time.cal.mwd.nonmembers/ostream.pass.cpp
@@ -18,6 +18,10 @@
 // TODO FMT It seems GCC uses too much memory in the CI and fails.
 // UNSUPPORTED: gcc-12
 
+// TODO FMT This test should not require std::to_chars(floating-point)
+// This test requires std::to_chars(floating-point), which is in the dylib
+// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}}
+
 // REQUIRES: locale.fr_FR.UTF-8
 // REQUIRES: locale.ja_JP.UTF-8
 
diff --git a/libcxx/test/std/time/time.cal/time.cal.mwdlast/time.cal.mwdlast.nonmembers/ostream.pass.cpp b/libcxx/test/std/time/time.cal/time.cal.mwdlast/time.cal.mwdlast.nonmembers/ostream.pass.cpp
index 0a76fbd471d88..677219a0ee237 100644
--- a/libcxx/test/std/time/time.cal/time.cal.mwdlast/time.cal.mwdlast.nonmembers/ostream.pass.cpp
+++ b/libcxx/test/std/time/time.cal/time.cal.mwdlast/time.cal.mwdlast.nonmembers/ostream.pass.cpp
@@ -18,6 +18,10 @@
 // TODO FMT It seems GCC uses too much memory in the CI and fails.
 // UNSUPPORTED: gcc-12
 
+// TODO FMT This test should not require std::to_chars(floating-point)
+// This test requires std::to_chars(floating-point), which is in the dylib
+// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}}
+
 // REQUIRES: locale.fr_FR.UTF-8
 // REQUIRES: locale.ja_JP.UTF-8
 
diff --git a/libcxx/test/std/time/time.cal/time.cal.wdidx/time.cal.wdidx.nonmembers/ostream.pass.cpp b/libcxx/test/std/time/time.cal/time.cal.wdidx/time.cal.wdidx.nonmembers/ostream.pass.cpp
index 7a59350ffea10..f52406affaead 100644
--- a/libcxx/test/std/time/time.cal/time.cal.wdidx/time.cal.wdidx.nonmembers/ostream.pass.cpp
+++ b/libcxx/test/std/time/time.cal/time.cal.wdidx/time.cal.wdidx.nonmembers/ostream.pass.cpp
@@ -13,6 +13,10 @@
 // TODO FMT It seems GCC uses too much memory in the CI and fails.
 // UNSUPPORTED: gcc-12
 
+// TODO FMT This test should not require std::to_chars(floating-point)
+// This test requires std::to_chars(floating-point), which is in the dylib
+// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}}
+
 // REQUIRES: locale.fr_FR.UTF-8
 // REQUIRES: locale.ja_JP.UTF-8
 
diff --git a/libcxx/test/std/time/time.cal/time.cal.wdlast/time.cal.wdlast.nonmembers/ostream.pass.cpp b/libcxx/test/std/time/time.cal/time.cal.wdlast/time.cal.wdlast.nonmembers/ostream.pass.cpp
index 90dca782880a5..86fbc6d7b185a 100644
--- a/libcxx/test/std/time/time.cal/time.cal.wdlast/time.cal.wdlast.nonmembers/ostream.pass.cpp
+++ b/libcxx/test/std/time/time.cal/time.cal.wdlast/time.cal.wdlast.nonmembers/ostream.pass.cpp
@@ -13,6 +13,10 @@
 // TODO FMT It seems GCC uses too much memory in the CI and fails.
 // UNSUPPORTED: gcc-12
 
+// TODO FMT This test should not require std::to_chars(floating-point)
+// This test requires std::to_chars(floating-point), which is in the dylib
+// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}}
+
 // REQUIRES: locale.fr_FR.UTF-8
 // REQUIRES: locale.ja_JP.UTF-8
 
diff --git a/libcxx/test/std/time/time.cal/time.cal.weekday/time.cal.weekday.nonmembers/ostream.pass.cpp b/libcxx/test/std/time/time.cal/time.cal.weekday/time.cal.weekday.nonmembers/ostream.pass.cpp
index b52293b16d978..59c6bafdd1353 100644
--- a/libcxx/test/std/time/time.cal/time.cal.weekday/time.cal.weekday.nonmembers/ostream.pass.cpp
+++ b/libcxx/test/std/time/time.cal/time.cal.weekday/time.cal.weekday.nonmembers/ostream.pass.cpp
@@ -10,6 +10,10 @@
 // UNSUPPORTED: no-localization
 // UNSUPPORTED: libcpp-has-no-incomplete-format
 
+// TODO FMT This test should not require std::to_chars(floating-point)
+// This test requires std::to_chars(floating-point), which is in the dylib
+// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}}
+
 // REQUIRES: locale.fr_FR.UTF-8
 // REQUIRES: locale.ja_JP.UTF-8
 
diff --git a/libcxx/test/std/time/time.cal/time.cal.year/time.cal.year.nonmembers/ostream.pass.cpp b/libcxx/test/std/time/time.cal/time.cal.year/time.cal.year.nonmembers/ostream.pass.cpp
index c979fc2874ca7..1ba7f6eebd011 100644
--- a/libcxx/test/std/time/time.cal/time.cal.year/time.cal.year.nonmembers/ostream.pass.cpp
+++ b/libcxx/test/std/time/time.cal/time.cal.year/time.cal.year.nonmembers/ostream.pass.cpp
@@ -13,6 +13,10 @@
 // TODO FMT Investigate Windows issues.
 // UNSUPPORTED: msvc, target={{.+}}-windows-gnu
 
+// TODO FMT This test should not require std::to_chars(floating-point)
+// This test requires std::to_chars(floating-point), which is in the dylib
+// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}}
+
 // REQUIRES: locale.fr_FR.UTF-8
 // REQUIRES: locale.ja_JP.UTF-8
 
diff --git a/libcxx/test/std/time/time.cal/time.cal.ym/time.cal.ym.nonmembers/ostream.pass.cpp b/libcxx/test/std/time/time.cal/time.cal.ym/time.cal.ym.nonmembers/ostream.pass.cpp
index 13257ccbbb475..624bf4d4f1fd8 100644
--- a/libcxx/test/std/time/time.cal/time.cal.ym/time.cal.ym.nonmembers/ostream.pass.cpp
+++ b/libcxx/test/std/time/time.cal/time.cal.ym/time.cal.ym.nonmembers/ostream.pass.cpp
@@ -18,6 +18,10 @@
 // TODO FMT It seems GCC uses too much memory in the CI and fails.
 // UNSUPPORTED: gcc-12
 
+// TODO FMT This test should not require std::to_chars(floating-point)
+// This test requires std::to_chars(floating-point), which is in the dylib
+// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}}
+
 // REQUIRES: locale.fr_FR.UTF-8
 // REQUIRES: locale.ja_JP.UTF-8
 
diff --git a/libcxx/test/std/time/time.cal/time.cal.ymd/time.cal.ymd.nonmembers/ostream.pass.cpp b/libcxx/test/std/time/time.cal/time.cal.ymd/time.cal.ymd.nonmembers/ostream.pass.cpp
index 595e00f0008c8..64a42ff081602 100644
--- a/libcxx/test/std/time/time.cal/time.cal.ymd/time.cal.ymd.nonmembers/ostream.pass.cpp
+++ b/libcxx/test/std/time/time.cal/time.cal.ymd/time.cal.ymd.nonmembers/ostream.pass.cpp
@@ -16,6 +16,10 @@
 // TODO FMT It seems GCC uses too much memory in the CI and fails.
 // UNSUPPORTED: gcc-12
 
+// TODO FMT This test should not require std::to_chars(floating-point)
+// This test requires std::to_chars(floating-point), which is in the dylib
+// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}}
+
 // REQUIRES: locale.fr_FR.UTF-8
 // REQUIRES: locale.ja_JP.UTF-8
 
diff --git a/libcxx/test/std/time/time.cal/time.cal.ymdlast/time.cal.ymdlast.nonmembers/ostream.pass.cpp b/libcxx/test/std/time/time.cal/time.cal.ymdlast/time.cal.ymdlast.nonmembers/ostream.pass.cpp
index 9ded734a83548..254fe6a846d83 100644
--- a/libcxx/test/std/time/time.cal/time.cal.ymdlast/time.cal.ymdlast.nonmembers/ostream.pass.cpp
+++ b/libcxx/test/std/time/time.cal/time.cal.ymdlast/time.cal.ymdlast.nonmembers/ostream.pass.cpp
@@ -18,6 +18,10 @@
 // TODO FMT It seems GCC uses too much memory in the CI and fails.
 // UNSUPPORTED: gcc-12
 
+// TODO FMT This test should not require std::to_chars(floating-point)
+// This test requires std::to_chars(floating-point), which is in the dylib
+// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}}
+
 // REQUIRES: locale.fr_FR.UTF-8
 // REQUIRES: locale.ja_JP.UTF-8
 
diff --git a/libcxx/test/std/time/time.cal/time.cal.ymwd/time.cal.ymwd.nonmembers/ostream.pass.cpp b/libcxx/test/std/time/time.cal/time.cal.ymwd/time.cal.ymwd.nonmembers/ostream.pass.cpp
index 05578f6c2397b..76c74b80945b5 100644
--- a/libcxx/test/std/time/time.cal/time.cal.ymwd/time.cal.ymwd.nonmembers/ostream.pass.cpp
+++ b/libcxx/test/std/time/time.cal/time.cal.ymwd/time.cal.ymwd.nonmembers/ostream.pass.cpp
@@ -16,6 +16,10 @@
 // TODO FMT It seems GCC uses too much memory in the CI and fails.
 // UNSUPPORTED: gcc-12
 
+// TODO FMT This test should not require std::to_chars(floating-point)
+// This test requires std::to_chars(floating-point), which is in the dylib
+// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}}
+
 // REQUIRES: locale.fr_FR.UTF-8
 // REQUIRES: locale.ja_JP.UTF-8
 
diff --git a/libcxx/test/std/time/time.cal/time.cal.ymwdlast/time.cal.ymwdlast.nonmembers/ostream.pass.cpp b/libcxx/test/std/time/time.cal/time.cal.ymwdlast/time.cal.ymwdlast.nonmembers/ostream.pass.cpp
index 150813379e276..478e2b88d2a91 100644
--- a/libcxx/test/std/time/time.cal/time.cal.ymwdlast/time.cal.ymwdlast.nonmembers/ostream.pass.cpp
+++ b/libcxx/test/std/time/time.cal/time.cal.ymwdlast/time.cal.ymwdlast.nonmembers/ostream.pass.cpp
@@ -16,6 +16,10 @@
 // TODO FMT It seems GCC uses too much memory in the CI and fails.
 // UNSUPPORTED: gcc-12
 
+// TODO FMT This test should not require std::to_chars(floating-point)
+// This test requires std::to_chars(floating-point), which is in the dylib
+// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}}
+
 // REQUIRES: locale.fr_FR.UTF-8
 // REQUIRES: locale.ja_JP.UTF-8
 
diff --git a/libcxx/test/std/time/time.duration/time.duration.nonmember/ostream.pass.cpp b/libcxx/test/std/time/time.duration/time.duration.nonmember/ostream.pass.cpp
index e05146686434a..416a472de5085 100644
--- a/libcxx/test/std/time/time.duration/time.duration.nonmember/ostream.pass.cpp
+++ b/libcxx/test/std/time/time.duration/time.duration.nonmember/ostream.pass.cpp
@@ -15,6 +15,10 @@
 // TODO FMT Evaluate gcc-12 status
 // UNSUPPORTED: gcc-12
 
+// TODO FMT This test should not require std::to_chars(floating-point)
+// This test requires std::to_chars(floating-point), which is in the dylib
+// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}}
+
 // REQUIRES: locale.fr_FR.UTF-8
 // REQUIRES: locale.ja_JP.UTF-8
 
diff --git a/libcxx/test/std/time/time.hms/time.hms.nonmembers/ostream.pass.cpp b/libcxx/test/std/time/time.hms/time.hms.nonmembers/ostream.pass.cpp
index c639ea8dcec42..5aef2140e1f78 100644
--- a/libcxx/test/std/time/time.hms/time.hms.nonmembers/ostream.pass.cpp
+++ b/libcxx/test/std/time/time.hms/time.hms.nonmembers/ostream.pass.cpp
@@ -16,6 +16,9 @@
 // TODO FMT Investigate Windows issues.
 // UNSUPPORTED: msvc, target={{.+}}-windows-gnu
 
+// This test requires std::to_chars(floating-point), which is in the dylib
+// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}}
+
 // REQUIRES: locale.fr_FR.UTF-8
 // REQUIRES: locale.ja_JP.UTF-8
 
diff --git a/libcxx/test/std/time/time.syn/formatter.day.pass.cpp b/libcxx/test/std/time/time.syn/formatter.day.pass.cpp
index 373db7217e14d..2329e1b6e2451 100644
--- a/libcxx/test/std/time/time.syn/formatter.day.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.day.pass.cpp
@@ -14,6 +14,10 @@
 // TODO FMT Investigate Windows issues.
// UNSUPPORTED: msvc, target={{.+}}-windows-gnu +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // REQUIRES: locale.fr_FR.UTF-8 // REQUIRES: locale.ja_JP.UTF-8 diff --git a/libcxx/test/std/time/time.syn/formatter.duration.pass.cpp b/libcxx/test/std/time/time.syn/formatter.duration.pass.cpp index 3ca7ae2b40c91..fd9d095603a81 100644 --- a/libcxx/test/std/time/time.syn/formatter.duration.pass.cpp +++ b/libcxx/test/std/time/time.syn/formatter.duration.pass.cpp @@ -16,6 +16,9 @@ // TODO FMT Evaluate gcc-12 status // UNSUPPORTED: gcc-12 +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // REQUIRES: locale.fr_FR.UTF-8 // REQUIRES: locale.ja_JP.UTF-8 diff --git a/libcxx/test/std/time/time.syn/formatter.hh_mm_ss.pass.cpp b/libcxx/test/std/time/time.syn/formatter.hh_mm_ss.pass.cpp index 2fbca0e1aece3..a0f1ec0f8164b 100644 --- a/libcxx/test/std/time/time.syn/formatter.hh_mm_ss.pass.cpp +++ b/libcxx/test/std/time/time.syn/formatter.hh_mm_ss.pass.cpp @@ -17,6 +17,9 @@ // XFAIL: LIBCXX-FREEBSD-FIXME +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // REQUIRES: locale.fr_FR.UTF-8 // REQUIRES: locale.ja_JP.UTF-8 diff --git a/libcxx/test/std/time/time.syn/formatter.month.pass.cpp b/libcxx/test/std/time/time.syn/formatter.month.pass.cpp index 324887f8af8c5..d479679f88498 100644 --- a/libcxx/test/std/time/time.syn/formatter.month.pass.cpp +++ b/libcxx/test/std/time/time.syn/formatter.month.pass.cpp @@ -11,6 +11,10 @@ // UNSUPPORTED: no-localization // UNSUPPORTED: libcpp-has-no-incomplete-format +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // REQUIRES: locale.fr_FR.UTF-8 // REQUIRES: locale.ja_JP.UTF-8 diff --git a/libcxx/test/std/time/time.syn/formatter.month_day.pass.cpp b/libcxx/test/std/time/time.syn/formatter.month_day.pass.cpp index f86476f716325..cce2832509394 100644 --- a/libcxx/test/std/time/time.syn/formatter.month_day.pass.cpp +++ b/libcxx/test/std/time/time.syn/formatter.month_day.pass.cpp @@ -17,6 +17,10 @@ // TODO FMT It seems GCC uses too much memory in the CI and fails. // UNSUPPORTED: gcc-12 +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // REQUIRES: locale.fr_FR.UTF-8 // REQUIRES: locale.ja_JP.UTF-8 diff --git a/libcxx/test/std/time/time.syn/formatter.month_day_last.pass.cpp b/libcxx/test/std/time/time.syn/formatter.month_day_last.pass.cpp index 7c78271b908bb..60ddd8ba47759 100644 --- a/libcxx/test/std/time/time.syn/formatter.month_day_last.pass.cpp +++ b/libcxx/test/std/time/time.syn/formatter.month_day_last.pass.cpp @@ -14,6 +14,10 @@ // TODO FMT It seems GCC uses too much memory in the CI and fails. 
// UNSUPPORTED: gcc-12 +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // REQUIRES: locale.fr_FR.UTF-8 // REQUIRES: locale.ja_JP.UTF-8 diff --git a/libcxx/test/std/time/time.syn/formatter.month_weekday.pass.cpp b/libcxx/test/std/time/time.syn/formatter.month_weekday.pass.cpp index f34b22c00cfe6..207cc09c166e3 100644 --- a/libcxx/test/std/time/time.syn/formatter.month_weekday.pass.cpp +++ b/libcxx/test/std/time/time.syn/formatter.month_weekday.pass.cpp @@ -14,6 +14,10 @@ // TODO FMT It seems GCC uses too much memory in the CI and fails. // UNSUPPORTED: gcc-12 +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // REQUIRES: locale.fr_FR.UTF-8 // REQUIRES: locale.ja_JP.UTF-8 diff --git a/libcxx/test/std/time/time.syn/formatter.weekday.pass.cpp b/libcxx/test/std/time/time.syn/formatter.weekday.pass.cpp index 1b3fbe9f59adb..de8172e98282d 100644 --- a/libcxx/test/std/time/time.syn/formatter.weekday.pass.cpp +++ b/libcxx/test/std/time/time.syn/formatter.weekday.pass.cpp @@ -14,6 +14,10 @@ // TODO FMT Investigate Windows issues. // UNSUPPORTED: msvc, target={{.+}}-windows-gnu +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // REQUIRES: locale.fr_FR.UTF-8 // REQUIRES: locale.ja_JP.UTF-8 diff --git a/libcxx/test/std/time/time.syn/formatter.weekday_index.pass.cpp b/libcxx/test/std/time/time.syn/formatter.weekday_index.pass.cpp index 54930343a8d7a..a063d23427a89 100644 --- a/libcxx/test/std/time/time.syn/formatter.weekday_index.pass.cpp +++ b/libcxx/test/std/time/time.syn/formatter.weekday_index.pass.cpp @@ -17,6 +17,10 @@ // TODO FMT It seems GCC uses too much memory in the CI and fails. // UNSUPPORTED: gcc-12 +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // REQUIRES: locale.fr_FR.UTF-8 // REQUIRES: locale.ja_JP.UTF-8 diff --git a/libcxx/test/std/time/time.syn/formatter.weekday_last.pass.cpp b/libcxx/test/std/time/time.syn/formatter.weekday_last.pass.cpp index e3fa9ae3ba3dd..a33c57481d0a0 100644 --- a/libcxx/test/std/time/time.syn/formatter.weekday_last.pass.cpp +++ b/libcxx/test/std/time/time.syn/formatter.weekday_last.pass.cpp @@ -17,6 +17,10 @@ // TODO FMT It seems GCC uses too much memory in the CI and fails. 
// UNSUPPORTED: gcc-12 +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // REQUIRES: locale.fr_FR.UTF-8 // REQUIRES: locale.ja_JP.UTF-8 diff --git a/libcxx/test/std/time/time.syn/formatter.year.pass.cpp b/libcxx/test/std/time/time.syn/formatter.year.pass.cpp index beb47321e6644..7907033828cf2 100644 --- a/libcxx/test/std/time/time.syn/formatter.year.pass.cpp +++ b/libcxx/test/std/time/time.syn/formatter.year.pass.cpp @@ -14,6 +14,10 @@ // TODO FMT Investigate Windows issues. // UNSUPPORTED: msvc, target={{.+}}-windows-gnu +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // REQUIRES: locale.fr_FR.UTF-8 // REQUIRES: locale.ja_JP.UTF-8 diff --git a/libcxx/test/std/time/time.syn/formatter.year_month.pass.cpp b/libcxx/test/std/time/time.syn/formatter.year_month.pass.cpp index 344967d41f774..d7c65bb62ad7c 100644 --- a/libcxx/test/std/time/time.syn/formatter.year_month.pass.cpp +++ b/libcxx/test/std/time/time.syn/formatter.year_month.pass.cpp @@ -14,6 +14,10 @@ // TODO FMT It seems GCC uses too much memory in the CI and fails. // UNSUPPORTED: gcc-12 +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // REQUIRES: locale.fr_FR.UTF-8 // REQUIRES: locale.ja_JP.UTF-8 diff --git a/libcxx/test/std/time/time.syn/formatter.year_month_day.pass.cpp b/libcxx/test/std/time/time.syn/formatter.year_month_day.pass.cpp index 25d5a5807467b..22fada55d5768 100644 --- a/libcxx/test/std/time/time.syn/formatter.year_month_day.pass.cpp +++ b/libcxx/test/std/time/time.syn/formatter.year_month_day.pass.cpp @@ -17,6 +17,10 @@ // TODO FMT It seems GCC uses too much memory in the CI and fails. // UNSUPPORTED: gcc-12 +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // REQUIRES: locale.fr_FR.UTF-8 // REQUIRES: locale.ja_JP.UTF-8 diff --git a/libcxx/test/std/time/time.syn/formatter.year_month_day_last.pass.cpp b/libcxx/test/std/time/time.syn/formatter.year_month_day_last.pass.cpp index 35ce599e1a08a..5ffa9e3d9cd9f 100644 --- a/libcxx/test/std/time/time.syn/formatter.year_month_day_last.pass.cpp +++ b/libcxx/test/std/time/time.syn/formatter.year_month_day_last.pass.cpp @@ -14,6 +14,10 @@ // TODO FMT It seems GCC uses too much memory in the CI and fails. 
// UNSUPPORTED: gcc-12 +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // REQUIRES: locale.fr_FR.UTF-8 // REQUIRES: locale.ja_JP.UTF-8 diff --git a/libcxx/test/std/time/time.syn/formatter.year_month_weekday.pass.cpp b/libcxx/test/std/time/time.syn/formatter.year_month_weekday.pass.cpp index 617f183882202..775fe81fea80f 100644 --- a/libcxx/test/std/time/time.syn/formatter.year_month_weekday.pass.cpp +++ b/libcxx/test/std/time/time.syn/formatter.year_month_weekday.pass.cpp @@ -14,6 +14,10 @@ // TODO FMT It seems GCC uses too much memory in the CI and fails. // UNSUPPORTED: gcc-12 +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // REQUIRES: locale.fr_FR.UTF-8 // REQUIRES: locale.ja_JP.UTF-8 diff --git a/libcxx/test/std/time/time.syn/formatter.year_month_weekday_last.pass.cpp b/libcxx/test/std/time/time.syn/formatter.year_month_weekday_last.pass.cpp index 50968a39dbe64..166cc616888d6 100644 --- a/libcxx/test/std/time/time.syn/formatter.year_month_weekday_last.pass.cpp +++ b/libcxx/test/std/time/time.syn/formatter.year_month_weekday_last.pass.cpp @@ -12,6 +12,10 @@ // TODO FMT It seems GCC uses too much memory in the CI and fails. // UNSUPPORTED: gcc-12 +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // REQUIRES: locale.fr_FR.UTF-8 // REQUIRES: locale.ja_JP.UTF-8 diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_format_args.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_format_args.pass.cpp index 2e602428df484..44fee37d40245 100644 --- a/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_format_args.pass.cpp +++ b/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_format_args.pass.cpp @@ -22,10 +22,10 @@ #include "test_macros.h" int main(int, char**) { - [[maybe_unused]] auto store = std::make_format_args(42, nullptr, false, 1.0); + [[maybe_unused]] auto store = std::make_format_args(42, nullptr, false, 'x'); LIBCPP_STATIC_ASSERT( - std::same_as>); + std::same_as>); return 0; } diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_wformat_args.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_wformat_args.pass.cpp index 14328a1425d08..feb23c503a21f 100644 --- a/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_wformat_args.pass.cpp +++ b/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_wformat_args.pass.cpp @@ -22,10 +22,10 @@ #include "test_macros.h" int main(int, char**) { - [[maybe_unused]] auto store = std::make_wformat_args(42, nullptr, false, 1.0); + [[maybe_unused]] auto store = std::make_wformat_args(42, nullptr, false, 'x'); LIBCPP_STATIC_ASSERT( - std::same_as>); + std::same_as>); return 0; } diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp 
b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp index 29092a35d711c..c67d868dcfebe 100644 --- a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp +++ b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp @@ -8,9 +8,6 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: libcpp-has-no-incomplete-format -// This test requires the dylib support introduced in D92214. -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{9|10|11|12|13|14|15}} - // // template diff --git a/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp index 35bee3ecce59c..ecb055e3026bc 100644 --- a/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp +++ b/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp @@ -8,9 +8,6 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: libcpp-has-no-incomplete-format -// This test requires the dylib support introduced in D92214. -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{9|10|11|12|13|14|15}} - // // basic_format_arg get(size_t i) const noexcept; diff --git a/libcxx/test/std/utilities/format/format.error/format.error.pass.cpp b/libcxx/test/std/utilities/format/format.error/format.error.pass.cpp index e3d3c48995bc1..c2d3d6b2c8e95 100644 --- a/libcxx/test/std/utilities/format/format.error/format.error.pass.cpp +++ b/libcxx/test/std/utilities/format/format.error/format.error.pass.cpp @@ -9,9 +9,6 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: libcpp-has-no-incomplete-format -// This test requires the dylib support introduced in D92214. 
-// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{9|10|11|12|13|14|15}} - // // class format_error; diff --git a/libcxx/test/std/utilities/format/format.fmt.string/ctor.verify.cpp b/libcxx/test/std/utilities/format/format.fmt.string/ctor.verify.cpp index 8f5404daaf396..d51531dacf734 100644 --- a/libcxx/test/std/utilities/format/format.fmt.string/ctor.verify.cpp +++ b/libcxx/test/std/utilities/format/format.fmt.string/ctor.verify.cpp @@ -33,14 +33,14 @@ void run() { (void)std::basic_format_string{"{}"}; // expected-error-re {{call to consteval function{{.*}}is not a constant expression}} (void)std::basic_format_string{"{0:{0}P}"}; // expected-error-re {{call to consteval function{{.*}}is not a constant expression}} (void)std::basic_format_string{"{0:{0}}"}; - (void)std::basic_format_string{"{0:{0}}"}; // expected-error-re {{call to consteval function{{.*}}is not a constant expression}} + (void)std::basic_format_string{"{0:{0}}"}; // expected-error-re {{call to consteval function{{.*}}is not a constant expression}} (void)std::basic_format_string{"{.3}"}; // expected-error-re {{call to consteval function{{.*}}is not a constant expression}} #ifndef TEST_HAS_NO_WIDE_CHARACTERS (void)std::basic_format_string{L"foo"}; (void)std::basic_format_string{L"{}"}; // expected-error-re {{call to consteval function{{.*}}is not a constant expression}} (void)std::basic_format_string{L"{0:{0}P}"}; // expected-error-re {{call to consteval function{{.*}}is not a constant expression}} (void)std::basic_format_string{L"{0:{0}}"}; - (void)std::basic_format_string{L"{0:{0}}"}; // expected-error-re {{call to consteval function{{.*}}is not a constant expression}} + (void)std::basic_format_string{L"{0:{0}}"}; // expected-error-re {{call to consteval function{{.*}}is not a constant expression}} (void)std::basic_format_string{L"{.3}"}; // expected-error-re {{call to consteval function{{.*}}is not a constant expression}} #endif } diff --git a/libcxx/test/std/utilities/format/format.fmt.string/get.pass.cpp b/libcxx/test/std/utilities/format/format.fmt.string/get.pass.cpp index bf7e2add0e8f5..d9a8c9f719b9a 100644 --- a/libcxx/test/std/utilities/format/format.fmt.string/get.pass.cpp +++ b/libcxx/test/std/utilities/format/format.fmt.string/get.pass.cpp @@ -35,10 +35,10 @@ template constexpr bool test() { assert((std::basic_format_string{CSTR("foo")}.get() == SV("foo"))); assert((std::basic_format_string{CSTR("{}")}.get() == SV("{}"))); - assert((std::basic_format_string{CSTR("{} {:01.23L}")}.get() == SV("{} {:01.23L}"))); + assert((std::basic_format_string{CSTR("{} {:*>6}")}.get() == SV("{} {:*>6}"))); // Embedded NUL character - assert((std::basic_format_string{SV("{}\0{}")}.get() == SV("{}\0{}"))); + assert((std::basic_format_string{SV("{}\0{}")}.get() == SV("{}\0{}"))); return true; } diff --git a/libcxx/test/std/utilities/format/format.fmt.string/types.compile.pass.cpp b/libcxx/test/std/utilities/format/format.fmt.string/types.compile.pass.cpp index 3ebd2bfc4fbd5..1ecfb5d992741 100644 --- a/libcxx/test/std/utilities/format/format.fmt.string/types.compile.pass.cpp +++ b/libcxx/test/std/utilities/format/format.fmt.string/types.compile.pass.cpp @@ -29,12 +29,11 @@ static_assert(std::same_as, std::basic_format_string>); static_assert(std::same_as, std::basic_format_string>); -static_assert(std::same_as, std::basic_format_string>); -static_assert(std::same_as, std::basic_format_string>); +static_assert(std::same_as, std::basic_format_string>); +static_assert(std::same_as, std::basic_format_string>); #ifndef 
TEST_HAS_NO_WIDE_CHARACTERS static_assert(std::same_as, std::basic_format_string>); static_assert(std::same_as, std::basic_format_string>); -static_assert(std::same_as, std::basic_format_string>); -static_assert( - std::same_as, std::basic_format_string>); +static_assert(std::same_as, std::basic_format_string>); +static_assert(std::same_as, std::basic_format_string>); #endif diff --git a/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp b/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp index 0e4708e068ec4..54c8c1bd1f170 100644 --- a/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp +++ b/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp @@ -8,6 +8,9 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: libcpp-has-no-incomplete-format +// This test uses std::filesystem::path, which was introduced in macOS 10.15 +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{9|10|11|12|13|14}} + // // template @@ -107,9 +110,7 @@ void test_P0645() { assert_is_formattable<__uint128_t, CharT>(); #endif - assert_is_formattable(); - assert_is_formattable(); - assert_is_formattable(); + // floating-point types are tested in concept.formattable.float.compile.pass.cpp assert_is_formattable(); assert_is_formattable(); diff --git a/libcxx/test/std/utilities/format/format.formattable/concept.formattable.float.compile.pass.cpp b/libcxx/test/std/utilities/format/format.formattable/concept.formattable.float.compile.pass.cpp new file mode 100644 index 0000000000000..09b957f9d1682 --- /dev/null +++ b/libcxx/test/std/utilities/format/format.formattable/concept.formattable.float.compile.pass.cpp @@ -0,0 +1,58 @@ +//===----------------------------------------------------------------------===// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 +// UNSUPPORTED: libcpp-has-no-incomplete-format + +// + +// template +// concept formattable = ... + +#include +#include + +#include "test_macros.h" + +template +void assert_is_not_formattable() { + static_assert(!std::formattable); +} + +template +void assert_is_formattable() { + // Only formatters for CharT == char || CharT == wchar_t are enabled for the + // standard formatters. When CharT is a different type the formatter should + // be disabled. 
+ if constexpr (std::same_as +#ifndef TEST_HAS_NO_WIDE_CHARACTERS + || std::same_as +#endif + ) + static_assert(std::formattable); + else + assert_is_not_formattable(); +} + +template +void test() { + assert_is_formattable(); + assert_is_formattable(); + assert_is_formattable(); +} + +void test() { + test(); +#ifndef TEST_HAS_NO_WIDE_CHARACTERS + test(); +#endif + test(); + test(); + test(); + + test(); +} diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.floating_point.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.floating_point.pass.cpp index dc40acecc8a19..9f701dfd015a8 100644 --- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.floating_point.pass.cpp +++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.floating_point.pass.cpp @@ -8,6 +8,9 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: libcpp-has-no-incomplete-format +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // // [format.formatter.spec]: diff --git a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/check_arg_id.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/check_arg_id.pass.cpp index bc6b418862525..f106105b984cf 100644 --- a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/check_arg_id.pass.cpp +++ b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/check_arg_id.pass.cpp @@ -9,9 +9,6 @@ // UNSUPPORTED: no-exceptions // UNSUPPORTED: libcpp-has-no-incomplete-format -// This test requires the dylib support introduced in D92214. -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{9|10|11|12|13|14|15}} - // // constexpr void check_arg_id(size_t id); diff --git a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/next_arg_id.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/next_arg_id.pass.cpp index 2d6de1f2f3354..03da8fde392bb 100644 --- a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/next_arg_id.pass.cpp +++ b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/next_arg_id.pass.cpp @@ -9,9 +9,6 @@ // UNSUPPORTED: no-exceptions // UNSUPPORTED: libcpp-has-no-incomplete-format -// This test requires the dylib support introduced in D92214. -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{9|10|11|12|13|14|15}} - // // constexpr size_t next_arg_id(); diff --git a/libcxx/test/std/utilities/format/format.functions/P2418.pass.cpp b/libcxx/test/std/utilities/format/format.functions/P2418.pass.cpp index 9a5baecbb5f97..1e6893d093451 100644 --- a/libcxx/test/std/utilities/format/format.functions/P2418.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/P2418.pass.cpp @@ -10,6 +10,10 @@ // TODO FMT Evaluate gcc-12 status // UNSUPPORTED: gcc-12 +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // Tests whether a move only type can be formatted. 
This is required by // P2418R2 "Add support for std::generator-like types to std::format" diff --git a/libcxx/test/std/utilities/format/format.functions/ascii.pass.cpp b/libcxx/test/std/utilities/format/format.functions/ascii.pass.cpp index 4b94ebb9ec6b6..74f20f2d797d0 100644 --- a/libcxx/test/std/utilities/format/format.functions/ascii.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/ascii.pass.cpp @@ -11,6 +11,10 @@ // Force unicode to be disabled. // ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_HAS_NO_UNICODE +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // // Tests Unicode is ignored and handled as ASCII. diff --git a/libcxx/test/std/utilities/format/format.functions/escaped_output.ascii.pass.cpp b/libcxx/test/std/utilities/format/format.functions/escaped_output.ascii.pass.cpp index 5ceedf9f05c42..911e938887f97 100644 --- a/libcxx/test/std/utilities/format/format.functions/escaped_output.ascii.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/escaped_output.ascii.pass.cpp @@ -8,6 +8,10 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: libcpp-has-no-incomplete-format +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // Force unicode to be disabled. // ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_HAS_NO_UNICODE diff --git a/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp b/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp index 6d002a10c1479..e3ab2d16bd26f 100644 --- a/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp @@ -15,6 +15,10 @@ // UNSUPPORTED: msvc, target={{.+}}-windows-gnu // UNSUPPORTED: LIBCXX-AIX-FIXME +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // // This test the debug string type for the formatter specializations for char diff --git a/libcxx/test/std/utilities/format/format.functions/format.locale.pass.cpp b/libcxx/test/std/utilities/format/format.functions/format.locale.pass.cpp index 61c7abd8bedc8..eb90c75da8a05 100644 --- a/libcxx/test/std/utilities/format/format.functions/format.locale.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/format.locale.pass.cpp @@ -11,6 +11,9 @@ // TODO FMT Evaluate gcc-12 status // UNSUPPORTED:gcc-12 +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // // template diff --git a/libcxx/test/std/utilities/format/format.functions/format.locale.verify.cpp b/libcxx/test/std/utilities/format/format.functions/format.locale.verify.cpp index de182c7cb528b..1281a0b61e2a8 100644 --- a/libcxx/test/std/utilities/format/format.functions/format.locale.verify.cpp +++ b/libcxx/test/std/utilities/format/format.functions/format.locale.verify.cpp @@ -9,6 +9,10 @@ // 
UNSUPPORTED: no-localization // UNSUPPORTED: libcpp-has-no-incomplete-format +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // Basic test to validate ill-formed code is properly detected. // diff --git a/libcxx/test/std/utilities/format/format.functions/format.pass.cpp b/libcxx/test/std/utilities/format/format.functions/format.pass.cpp index 6b06fcf68d845..f444c51d511b0 100644 --- a/libcxx/test/std/utilities/format/format.functions/format.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/format.pass.cpp @@ -10,6 +10,9 @@ // TODO FMT Evaluate gcc-12 status // UNSUPPORTED: gcc-12 +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // Note this formatter shows additional information when tests are failing. // This aids the development. Since other formatters fail in the same fashion // they don't have this additional output. diff --git a/libcxx/test/std/utilities/format/format.functions/format.verify.cpp b/libcxx/test/std/utilities/format/format.functions/format.verify.cpp index 23c9c2c103f1b..45829313a6267 100644 --- a/libcxx/test/std/utilities/format/format.functions/format.verify.cpp +++ b/libcxx/test/std/utilities/format/format.functions/format.verify.cpp @@ -8,6 +8,10 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: libcpp-has-no-incomplete-format +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // Basic test to validate ill-formed code is properly detected. // diff --git a/libcxx/test/std/utilities/format/format.functions/format_to.locale.pass.cpp b/libcxx/test/std/utilities/format/format.functions/format_to.locale.pass.cpp index fd3eb06c95ffc..7079570813f20 100644 --- a/libcxx/test/std/utilities/format/format.functions/format_to.locale.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/format_to.locale.pass.cpp @@ -11,6 +11,9 @@ // TODO FMT Evaluate gcc-12 status // UNSUPPORTED: gcc-12 +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // // template diff --git a/libcxx/test/std/utilities/format/format.functions/format_to.locale.verify.cpp b/libcxx/test/std/utilities/format/format.functions/format_to.locale.verify.cpp index e3990603a5fc8..573257c228cc8 100644 --- a/libcxx/test/std/utilities/format/format.functions/format_to.locale.verify.cpp +++ b/libcxx/test/std/utilities/format/format.functions/format_to.locale.verify.cpp @@ -9,6 +9,10 @@ // UNSUPPORTED: no-localization // UNSUPPORTED: libcpp-has-no-incomplete-format +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // Basic test to validate ill-formed code is properly detected. 
// diff --git a/libcxx/test/std/utilities/format/format.functions/format_to.pass.cpp b/libcxx/test/std/utilities/format/format.functions/format_to.pass.cpp index c81a902ab62b0..cf8d55714728b 100644 --- a/libcxx/test/std/utilities/format/format.functions/format_to.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/format_to.pass.cpp @@ -10,6 +10,9 @@ // TODO FMT Evaluate gcc-12 status // UNSUPPORTED: gcc-12 +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // // template diff --git a/libcxx/test/std/utilities/format/format.functions/format_to.verify.cpp b/libcxx/test/std/utilities/format/format.functions/format_to.verify.cpp index a40b532dcc922..bd1fcc2b4b777 100644 --- a/libcxx/test/std/utilities/format/format.functions/format_to.verify.cpp +++ b/libcxx/test/std/utilities/format/format.functions/format_to.verify.cpp @@ -8,6 +8,10 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: libcpp-has-no-incomplete-format +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // // Basic test to validate ill-formed code is properly detected. diff --git a/libcxx/test/std/utilities/format/format.functions/format_to_n.locale.pass.cpp b/libcxx/test/std/utilities/format/format.functions/format_to_n.locale.pass.cpp index 8a9a5395dcc49..de32982c1562f 100644 --- a/libcxx/test/std/utilities/format/format.functions/format_to_n.locale.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/format_to_n.locale.pass.cpp @@ -11,6 +11,9 @@ // TODO FMT Evaluate gcc-12 status // UNSUPPORTED: gcc-12 +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // // template diff --git a/libcxx/test/std/utilities/format/format.functions/format_to_n.locale.verify.cpp b/libcxx/test/std/utilities/format/format.functions/format_to_n.locale.verify.cpp index ec4e2927c7368..b9d77de3f90f1 100644 --- a/libcxx/test/std/utilities/format/format.functions/format_to_n.locale.verify.cpp +++ b/libcxx/test/std/utilities/format/format.functions/format_to_n.locale.verify.cpp @@ -9,6 +9,10 @@ // UNSUPPORTED: no-localization // UNSUPPORTED: libcpp-has-no-incomplete-format +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // Basic test to validate ill-formed code is properly detected. 
// diff --git a/libcxx/test/std/utilities/format/format.functions/format_to_n.pass.cpp b/libcxx/test/std/utilities/format/format.functions/format_to_n.pass.cpp index b5effa320aab2..cfdeae9fd9fa6 100644 --- a/libcxx/test/std/utilities/format/format.functions/format_to_n.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/format_to_n.pass.cpp @@ -10,6 +10,9 @@ // TODO FMT Evaluate gcc-12 status // UNSUPPORTED: gcc-12 +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // // template diff --git a/libcxx/test/std/utilities/format/format.functions/format_to_n.verify.cpp b/libcxx/test/std/utilities/format/format.functions/format_to_n.verify.cpp index 7c064c584c8bb..b84615d590152 100644 --- a/libcxx/test/std/utilities/format/format.functions/format_to_n.verify.cpp +++ b/libcxx/test/std/utilities/format/format.functions/format_to_n.verify.cpp @@ -8,6 +8,10 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: libcpp-has-no-incomplete-format +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // Basic test to validate ill-formed code is properly detected. // diff --git a/libcxx/test/std/utilities/format/format.functions/formatted_size.locale.pass.cpp b/libcxx/test/std/utilities/format/format.functions/formatted_size.locale.pass.cpp index 7d41ddb81a00f..43800b9da8a9f 100644 --- a/libcxx/test/std/utilities/format/format.functions/formatted_size.locale.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/formatted_size.locale.pass.cpp @@ -11,6 +11,9 @@ // TODO FMT Evaluate gcc-12 status // UNSUPPORTED: gcc-12 +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // // template diff --git a/libcxx/test/std/utilities/format/format.functions/formatted_size.locale.verify.cpp b/libcxx/test/std/utilities/format/format.functions/formatted_size.locale.verify.cpp index dc80c12141c5f..d1b92d3fa0847 100644 --- a/libcxx/test/std/utilities/format/format.functions/formatted_size.locale.verify.cpp +++ b/libcxx/test/std/utilities/format/format.functions/formatted_size.locale.verify.cpp @@ -9,6 +9,10 @@ // UNSUPPORTED: no-localization // UNSUPPORTED: libcpp-has-no-incomplete-format +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // Basic test to validate ill-formed code is properly detected. 
// diff --git a/libcxx/test/std/utilities/format/format.functions/formatted_size.pass.cpp b/libcxx/test/std/utilities/format/format.functions/formatted_size.pass.cpp index cd31da125c849..6b03d34d9b271 100644 --- a/libcxx/test/std/utilities/format/format.functions/formatted_size.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/formatted_size.pass.cpp @@ -10,6 +10,9 @@ // TODO FMT Evaluate gcc-12 status // UNSUPPORTED: gcc-12 +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // // template diff --git a/libcxx/test/std/utilities/format/format.functions/formatted_size.verify.cpp b/libcxx/test/std/utilities/format/format.functions/formatted_size.verify.cpp index 1662d893221bb..a135dd7cc9706 100644 --- a/libcxx/test/std/utilities/format/format.functions/formatted_size.verify.cpp +++ b/libcxx/test/std/utilities/format/format.functions/formatted_size.verify.cpp @@ -8,6 +8,10 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: libcpp-has-no-incomplete-format +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // Basic test to validate ill-formed code is properly detected. // diff --git a/libcxx/test/std/utilities/format/format.functions/locale-specific_form.pass.cpp b/libcxx/test/std/utilities/format/format.functions/locale-specific_form.pass.cpp index d5939e255423e..9863922f9abcc 100644 --- a/libcxx/test/std/utilities/format/format.functions/locale-specific_form.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/locale-specific_form.pass.cpp @@ -12,6 +12,9 @@ // TODO FMT Evaluate gcc-12 status // UNSUPPORTED: gcc-12 +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // REQUIRES: locale.en_US.UTF-8 // diff --git a/libcxx/test/std/utilities/format/format.functions/unicode.pass.cpp b/libcxx/test/std/utilities/format/format.functions/unicode.pass.cpp index db659f53cf097..efe243573f04a 100644 --- a/libcxx/test/std/utilities/format/format.functions/unicode.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/unicode.pass.cpp @@ -15,6 +15,10 @@ // UNSUPPORTED msvc, target={{.+}}-windows-gnu // UNSUPPORTED: LIBCXX-AIX-FIXME +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // // Tests the Unicode width support of the standard format specifiers. 
diff --git a/libcxx/test/std/utilities/format/format.functions/vformat.locale.pass.cpp b/libcxx/test/std/utilities/format/format.functions/vformat.locale.pass.cpp index 4136123d6a9f5..7755b785518f3 100644 --- a/libcxx/test/std/utilities/format/format.functions/vformat.locale.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/vformat.locale.pass.cpp @@ -11,6 +11,9 @@ // TODO FMT Evaluate gcc-12 status // UNSUPPORTED: gcc-12 +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // // string vformat(const locale& loc, string_view fmt, format_args args); diff --git a/libcxx/test/std/utilities/format/format.functions/vformat.pass.cpp b/libcxx/test/std/utilities/format/format.functions/vformat.pass.cpp index 8a95b5524fe9f..7c4c4a10ea5d8 100644 --- a/libcxx/test/std/utilities/format/format.functions/vformat.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/vformat.pass.cpp @@ -10,6 +10,9 @@ // TODO FMT Evaluate gcc-12 status // UNSUPPORTED: gcc-12 +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // // string vformat(string_view fmt, format_args args); diff --git a/libcxx/test/std/utilities/format/format.functions/vformat_to.locale.pass.cpp b/libcxx/test/std/utilities/format/format.functions/vformat_to.locale.pass.cpp index 6c5e7f4ab5787..e1a740253d586 100644 --- a/libcxx/test/std/utilities/format/format.functions/vformat_to.locale.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/vformat_to.locale.pass.cpp @@ -11,6 +11,9 @@ // TODO FMT Evaluate gcc-12 status // UNSUPPORTED: gcc-12 +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // // template diff --git a/libcxx/test/std/utilities/format/format.functions/vformat_to.pass.cpp b/libcxx/test/std/utilities/format/format.functions/vformat_to.pass.cpp index 77c783411127f..92b5409e1409e 100644 --- a/libcxx/test/std/utilities/format/format.functions/vformat_to.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/vformat_to.pass.cpp @@ -10,6 +10,9 @@ // TODO FMT Evaluate gcc-12 status // UNSUPPORTED: gcc-12 +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + // // template diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/format.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/format.pass.cpp index 31b7e5658b125..8f398994d251b 100644 --- a/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/format.pass.cpp +++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/format.pass.cpp @@ -11,10 +11,6 @@ // TODO FMT Fix this test using GCC, it currently times out. // UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. 
-// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} - // // template diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/parse.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/parse.pass.cpp index a7a160989b336..c848f4ff2fc26 100644 --- a/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/parse.pass.cpp +++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/parse.pass.cpp @@ -11,10 +11,6 @@ // TODO FMT Fix this test using GCC, it currently times out. // UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} - // // template diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/set_brackets.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/set_brackets.pass.cpp index 6405b5ec22e51..c17edb28f6175 100644 --- a/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/set_brackets.pass.cpp +++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/set_brackets.pass.cpp @@ -11,10 +11,6 @@ // TODO FMT Fix this test using GCC, it currently times out. // UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} - // // template diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/set_separator.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/set_separator.pass.cpp index 21cee612bb2b0..3f91e7bc633a4 100644 --- a/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/set_separator.pass.cpp +++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/set_separator.pass.cpp @@ -11,10 +11,6 @@ // TODO FMT Fix this test using GCC, it currently times out. // UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. 
-// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} - // // template diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/format_kind.compile.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/format_kind.compile.pass.cpp index 7179a674a37ad..c2d2ec2968508 100644 --- a/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/format_kind.compile.pass.cpp +++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/format_kind.compile.pass.cpp @@ -8,6 +8,9 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: libcpp-has-no-incomplete-format +// This test uses std::filesystem::path, which was introduced in macOS 10.15 +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{9|10|11|12|13|14}} + // // template diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.format.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.format.pass.cpp index 7a0a2d18913cf..b3c8afd8bae3e 100644 --- a/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.format.pass.cpp +++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.format.pass.cpp @@ -11,9 +11,8 @@ // TODO FMT Fix this test using GCC, it currently times out. // UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} // diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.vformat.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.vformat.pass.cpp index 613eb5ea06392..d6b0f7e9f1c0f 100644 --- a/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.vformat.pass.cpp +++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.vformat.pass.cpp @@ -11,9 +11,8 @@ // TODO FMT Fix this test using GCC, it currently times out. // UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} // diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.pass.cpp index b459ba8ff2f2d..2275baee237cf 100644 --- a/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.pass.cpp +++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.pass.cpp @@ -11,10 +11,6 @@ // TODO FMT Fix this test using GCC, it currently times out. // UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. 
-// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} - // // template diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/parse.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/parse.pass.cpp index 92763be54e15a..be117a6de2ecf 100644 --- a/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/parse.pass.cpp +++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/parse.pass.cpp @@ -11,10 +11,6 @@ // TODO FMT Fix this test using GCC, it currently times out. // UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} - // // template diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.format.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.format.pass.cpp index 7df3284f72b71..1f6a550ee232f 100644 --- a/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.format.pass.cpp +++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.format.pass.cpp @@ -11,9 +11,8 @@ // TODO FMT Fix this test using GCC, it currently times out. // UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} // diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.vformat.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.vformat.pass.cpp index e411b036acbe1..e33a0bc383d14 100644 --- a/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.vformat.pass.cpp +++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.vformat.pass.cpp @@ -11,9 +11,8 @@ // TODO FMT Fix this test using GCC, it currently times out. // UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} // diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.pass.cpp index 5ca3bfe12012e..dcb3d67270019 100644 --- a/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.pass.cpp +++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.pass.cpp @@ -11,10 +11,6 @@ // TODO FMT Fix this test using GCC, it currently times out. // UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. 
-// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} - // // template diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtset/parse.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtset/parse.pass.cpp index 8b25021c984d0..88940525925fa 100644 --- a/libcxx/test/std/utilities/format/format.range/format.range.fmtset/parse.pass.cpp +++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtset/parse.pass.cpp @@ -11,10 +11,6 @@ // TODO FMT Fix this test using GCC, it currently times out. // UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} - // // template diff --git a/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.format.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.format.pass.cpp index e1ab825626b56..297f7b22779f1 100644 --- a/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.format.pass.cpp +++ b/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.format.pass.cpp @@ -11,9 +11,8 @@ // TODO FMT Fix this test using GCC, it currently times out. // UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} // diff --git a/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.vformat.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.vformat.pass.cpp index 98bda2debb670..c28cf547da7b3 100644 --- a/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.vformat.pass.cpp +++ b/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.vformat.pass.cpp @@ -11,9 +11,8 @@ // TODO FMT Fix this test using GCC, it currently times out. // UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} // diff --git a/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.pass.cpp index 1fdc5eb726d69..faf9e1a18ee1b 100644 --- a/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.pass.cpp +++ b/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.pass.cpp @@ -11,10 +11,6 @@ // TODO FMT Fix this test using GCC, it currently times out. // UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. 
-// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} - // // template diff --git a/libcxx/test/std/utilities/format/format.range/format.range.formatter/parse.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.formatter/parse.pass.cpp index ce1c0c93130b6..c440b1ac2b168 100644 --- a/libcxx/test/std/utilities/format/format.range/format.range.formatter/parse.pass.cpp +++ b/libcxx/test/std/utilities/format/format.range/format.range.formatter/parse.pass.cpp @@ -11,10 +11,6 @@ // TODO FMT Fix this test using GCC, it currently times out. // UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} - // // template diff --git a/libcxx/test/std/utilities/format/format.range/format.range.formatter/set_brackets.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.formatter/set_brackets.pass.cpp index c399a81f2a041..0b0e3a16c4109 100644 --- a/libcxx/test/std/utilities/format/format.range/format.range.formatter/set_brackets.pass.cpp +++ b/libcxx/test/std/utilities/format/format.range/format.range.formatter/set_brackets.pass.cpp @@ -11,10 +11,6 @@ // TODO FMT Fix this test using GCC, it currently times out. // UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} - // // template diff --git a/libcxx/test/std/utilities/format/format.range/format.range.formatter/set_separator.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.formatter/set_separator.pass.cpp index 192ddcd16c44a..c63cc52403f45 100644 --- a/libcxx/test/std/utilities/format/format.range/format.range.formatter/set_separator.pass.cpp +++ b/libcxx/test/std/utilities/format/format.range/format.range.formatter/set_separator.pass.cpp @@ -11,10 +11,6 @@ // TODO FMT Fix this test using GCC, it currently times out. // UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} - // // template diff --git a/libcxx/test/std/utilities/format/format.range/format.range.formatter/underlying.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.formatter/underlying.pass.cpp index 10a330f0baf1e..52ac58a726651 100644 --- a/libcxx/test/std/utilities/format/format.range/format.range.formatter/underlying.pass.cpp +++ b/libcxx/test/std/utilities/format/format.range/format.range.formatter/underlying.pass.cpp @@ -11,10 +11,6 @@ // TODO FMT Fix this test using GCC, it currently times out. // UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. 
-// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} - // // template diff --git a/libcxx/test/std/utilities/format/format.tuple/format.functions.format.pass.cpp b/libcxx/test/std/utilities/format/format.tuple/format.functions.format.pass.cpp index 75791fb945bcf..f5853f98ca97f 100644 --- a/libcxx/test/std/utilities/format/format.tuple/format.functions.format.pass.cpp +++ b/libcxx/test/std/utilities/format/format.tuple/format.functions.format.pass.cpp @@ -11,9 +11,9 @@ // TODO FMT Fix this test using GCC, it currently times out. // UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} // diff --git a/libcxx/test/std/utilities/format/format.tuple/format.functions.format.verify.cpp b/libcxx/test/std/utilities/format/format.tuple/format.functions.format.verify.cpp index 58685f956be12..5967d8630e065 100644 --- a/libcxx/test/std/utilities/format/format.tuple/format.functions.format.verify.cpp +++ b/libcxx/test/std/utilities/format/format.tuple/format.functions.format.verify.cpp @@ -8,6 +8,10 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: libcpp-has-no-incomplete-format +// TODO FMT This test should not require std::to_chars(floating-point) +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} + #include #include diff --git a/libcxx/test/std/utilities/format/format.tuple/format.functions.vformat.pass.cpp b/libcxx/test/std/utilities/format/format.tuple/format.functions.vformat.pass.cpp index 9445ddb517cad..2cca15a6d5dc8 100644 --- a/libcxx/test/std/utilities/format/format.tuple/format.functions.vformat.pass.cpp +++ b/libcxx/test/std/utilities/format/format.tuple/format.functions.vformat.pass.cpp @@ -10,9 +10,8 @@ // TODO FMT Evaluate gcc-12 status // UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} +// This test requires std::to_chars(floating-point), which is in the dylib +// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}} // diff --git a/libcxx/test/std/utilities/format/format.tuple/format.pass.cpp b/libcxx/test/std/utilities/format/format.tuple/format.pass.cpp index 017201481fa54..2d2e60cc20dd9 100644 --- a/libcxx/test/std/utilities/format/format.tuple/format.pass.cpp +++ b/libcxx/test/std/utilities/format/format.tuple/format.pass.cpp @@ -11,10 +11,6 @@ // TODO FMT Fix this test using GCC, it currently times out. // UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} - // // template... 
Ts> @@ -61,7 +57,7 @@ void test() { test(SV("(1)"), std::tuple{1}); test(SV("(1, 1)"), std::tuple{1, CharT('1')}); test(SV("(1, 1)"), std::pair{1, CharT('1')}); - test(SV("(1, 1, 1)"), std::tuple{1, CharT('1'), 1.0}); + test(SV("(1, 1, true)"), std::tuple{1, CharT('1'), true}); } void test() { diff --git a/libcxx/test/std/utilities/format/format.tuple/parse.pass.cpp b/libcxx/test/std/utilities/format/format.tuple/parse.pass.cpp index 05c90557cd54f..f38c9fad2df92 100644 --- a/libcxx/test/std/utilities/format/format.tuple/parse.pass.cpp +++ b/libcxx/test/std/utilities/format/format.tuple/parse.pass.cpp @@ -11,10 +11,6 @@ // TODO FMT Fix this test using GCC, it currently times out. // UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} - // // template... Ts> @@ -64,7 +60,7 @@ constexpr void test() { test>(); test>(); test>(); - test>(); + test>(); } constexpr bool test() { diff --git a/libcxx/test/std/utilities/format/format.tuple/set_brackets.pass.cpp b/libcxx/test/std/utilities/format/format.tuple/set_brackets.pass.cpp index 74af4f32fcf1d..63efdb077502e 100644 --- a/libcxx/test/std/utilities/format/format.tuple/set_brackets.pass.cpp +++ b/libcxx/test/std/utilities/format/format.tuple/set_brackets.pass.cpp @@ -11,10 +11,6 @@ // TODO FMT Fix this test using GCC, it currently times out. // UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} - // // template... Ts> @@ -49,7 +45,7 @@ constexpr void test() { test>(); test>(); test>(); - test>(); + test>(); } constexpr bool test() { diff --git a/libcxx/test/std/utilities/format/format.tuple/set_separator.pass.cpp b/libcxx/test/std/utilities/format/format.tuple/set_separator.pass.cpp index 0258ae215ed22..92a77b3357316 100644 --- a/libcxx/test/std/utilities/format/format.tuple/set_separator.pass.cpp +++ b/libcxx/test/std/utilities/format/format.tuple/set_separator.pass.cpp @@ -11,10 +11,6 @@ // TODO FMT Fix this test using GCC, it currently times out. // UNSUPPORTED: gcc-12 -// This test requires the dylib support introduced in D92214. -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{.+}} -// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx11.{{.+}} - // // class range_formatter @@ -49,7 +45,7 @@ constexpr void test() { test>(); test>(); test>(); - test>(); + test>(); } constexpr bool test() { diff --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot index 8ff6decb0a344..6591a8edb9e16 100755 --- a/libcxx/utils/ci/run-buildbot +++ b/libcxx/utils/ci/run-buildbot @@ -492,9 +492,6 @@ apple-system-backdeployment-assertions-*) PARAMS+=";unwind_runtime_root=${OSX_ROOTS}/macOS/libunwind/${DEPLOYMENT_TARGET}" PARAMS+=";use_system_cxx_lib=True" PARAMS+=";enable_assertions=True" - # TODO: Enable experimental features during back-deployment -- right now some of the availability - # annotations are incorrect, leading to test failures that could be avoided. 
- PARAMS+=";enable_experimental=False" generate-cmake -C "${MONOREPO_ROOT}/libcxx/cmake/caches/Apple.cmake" \ -DLIBCXX_TEST_CONFIG="apple-libc++-backdeployment.cfg.in" \ @@ -533,9 +530,6 @@ apple-system-backdeployment-*) PARAMS+=";abi_runtime_root=${OSX_ROOTS}/macOS/libc++abi/${DEPLOYMENT_TARGET}" PARAMS+=";unwind_runtime_root=${OSX_ROOTS}/macOS/libunwind/${DEPLOYMENT_TARGET}" PARAMS+=";use_system_cxx_lib=True" - # TODO: Enable experimental features during back-deployment -- right now some of the availability - # annotations are incorrect, leading to test failures that could be avoided. - PARAMS+=";enable_experimental=False" generate-cmake -C "${MONOREPO_ROOT}/libcxx/cmake/caches/Apple.cmake" \ -DLIBCXX_TEST_CONFIG="apple-libc++-backdeployment.cfg.in" \ From 0433abc8e03737982fe103bf1563393e738c1ba0 Mon Sep 17 00:00:00 2001 From: Christopher Ferris Date: Fri, 17 Mar 2023 15:25:48 -0700 Subject: [PATCH 038/208] [docs][scudo] Add information about M_PURGE_ALL. Add information about M_PURGE_ALL Reviewed By: Chia-hungDuan Differential Revision: https://reviews.llvm.org/D146336 --- llvm/docs/ScudoHardenedAllocator.rst | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/llvm/docs/ScudoHardenedAllocator.rst b/llvm/docs/ScudoHardenedAllocator.rst index 9c1cfa6edcd63..875d018c4d9ff 100644 --- a/llvm/docs/ScudoHardenedAllocator.rst +++ b/llvm/docs/ScudoHardenedAllocator.rst @@ -265,7 +265,16 @@ The following "mallopt" options are available (options are defined in | | the interval to the minimum and maximum value as | | | specified at compile time). | +---------------------------+-------------------------------------------------------+ -| M_PURGE | Forces immediate memory reclaiming (value is unused). | +| M_PURGE | Forces immediate memory reclaiming but does not | +| | reclaim everything. For smaller size classes, there | +| | is still some memory that is not reclaimed due to the | +| | extra time it takes and the small amount of memory | +| | that can be reclaimed. | +| | The value is ignored. | ++---------------------------+-------------------------------------------------------+ +| M_PURGE_ALL | Same as M_PURGE but will force release all possible | +| | memory regardless of how long it takes. | +| | The value is ignored. | +---------------------------+-------------------------------------------------------+ | M_MEMTAG_TUNING | Tunes the allocator's choice of memory tags to make | | | it more likely that a certain class of memory errors | From 22293a3d85e9b3c410269b7adab1a60f5dcb3aa4 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Wed, 22 Mar 2023 15:49:50 -0500 Subject: [PATCH 039/208] Revert "[flang] Feature list plugin" due to failing build This reverts commit 823ddba1b325f30fc3fb2e9d695c211b856a4d5d. 
--- flang/examples/CMakeLists.txt | 1 - flang/examples/FeatureList/CMakeLists.txt | 9 - flang/examples/FeatureList/FeatureList.cpp | 761 ------------------ flang/test/CMakeLists.txt | 1 - flang/test/Examples/feature-list-class.f90 | 88 -- .../test/Examples/feature-list-functions.f90 | 76 -- 6 files changed, 936 deletions(-) delete mode 100644 flang/examples/FeatureList/CMakeLists.txt delete mode 100644 flang/examples/FeatureList/FeatureList.cpp delete mode 100644 flang/test/Examples/feature-list-class.f90 delete mode 100644 flang/test/Examples/feature-list-functions.f90 diff --git a/flang/examples/CMakeLists.txt b/flang/examples/CMakeLists.txt index 8cc66ddbbbb0e..23fea3920efb6 100644 --- a/flang/examples/CMakeLists.txt +++ b/flang/examples/CMakeLists.txt @@ -1,4 +1,3 @@ add_subdirectory(ExternalHelloWorld) add_subdirectory(PrintFlangFunctionNames) add_subdirectory(FlangOmpReport) -add_subdirectory(FeatureList) diff --git a/flang/examples/FeatureList/CMakeLists.txt b/flang/examples/FeatureList/CMakeLists.txt deleted file mode 100644 index e17a7bebbff05..0000000000000 --- a/flang/examples/FeatureList/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -add_llvm_example_library(flangFeatureList - MODULE - FeatureList.cpp - - DEPENDS - acc_gen - flangFrontend - omp_gen -) diff --git a/flang/examples/FeatureList/FeatureList.cpp b/flang/examples/FeatureList/FeatureList.cpp deleted file mode 100644 index 7d7e63e148bc0..0000000000000 --- a/flang/examples/FeatureList/FeatureList.cpp +++ /dev/null @@ -1,761 +0,0 @@ -//===-- FeatureList.cpp ---------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// A plugin that counts the amount of times a particular parse tree node -// occurs. 
This plugin should cover each feature covered in dump-parse-tree.h -// -//===----------------------------------------------------------------------===// - -#include "flang/Frontend/FrontendActions.h" -#include "flang/Frontend/FrontendPluginRegistry.h" -#include "flang/Parser/parse-tree-visitor.h" -#include "flang/Parser/parse-tree.h" -#include "flang/Parser/parsing.h" - -#include - -using namespace Fortran::frontend; -using namespace Fortran::parser; -using namespace Fortran; - -#define READ_FEATURE_CUST(classname, n) \ - bool Pre(const classname &) { \ - record(#n); \ - return true; \ - } \ - void Post(const classname &) {} - -#define READ_FEATURE(classname) READ_FEATURE_CUST(classname, classname) - -struct NodeVisitor { -private: - std::map frequencies; - - void record(const char *name) { - const auto [it, ins] = frequencies.insert({name, 1}); - if (!ins) { - frequencies[name] = it->second + 1; - } - } - -public: - const std::map &getFrequencies() const { - return frequencies; - } - - READ_FEATURE_CUST(format::ControlEditDesc, ControlEditDesc) - READ_FEATURE_CUST(format::DerivedTypeDataEditDesc, DerivedTypeDataEditDesc) - READ_FEATURE_CUST(format::FormatItem, FormatItem) - READ_FEATURE_CUST(format::FormatSpecification, FormatSpecification) - READ_FEATURE_CUST( - format::IntrinsicTypeDataEditDesc, IntrinsicTypeDataEditDesc) - READ_FEATURE(Abstract) - READ_FEATURE(AccAtomicCapture) - READ_FEATURE(AccAtomicCapture::Stmt1) - READ_FEATURE(AccAtomicCapture::Stmt2) - READ_FEATURE(AccAtomicRead) - READ_FEATURE(AccAtomicUpdate) - READ_FEATURE(AccAtomicWrite) - READ_FEATURE(AccBeginBlockDirective) - READ_FEATURE(AccBeginCombinedDirective) - READ_FEATURE(AccBeginLoopDirective) - READ_FEATURE(AccBlockDirective) - READ_FEATURE(AccClause) - READ_FEATURE(AccBindClause) - READ_FEATURE(AccDefaultClause) - READ_FEATURE(AccClauseList) - READ_FEATURE(AccCombinedDirective) - READ_FEATURE(AccDataModifier) - READ_FEATURE(AccDataModifier::Modifier) - READ_FEATURE(AccDeclarativeDirective) - READ_FEATURE(AccEndAtomic) - READ_FEATURE(AccEndBlockDirective) - READ_FEATURE(AccEndCombinedDirective) - READ_FEATURE(AccGangArgument) - READ_FEATURE(AccObject) - READ_FEATURE(AccObjectList) - READ_FEATURE(AccObjectListWithModifier) - READ_FEATURE(AccObjectListWithReduction) - READ_FEATURE(AccReductionOperator) - READ_FEATURE(AccReductionOperator::Operator) - READ_FEATURE(AccSizeExpr) - READ_FEATURE(AccSizeExprList) - READ_FEATURE(AccSelfClause) - READ_FEATURE(AccStandaloneDirective) - READ_FEATURE(AccDeviceTypeExpr) - READ_FEATURE(AccDeviceTypeExprList) - READ_FEATURE(AccTileExpr) - READ_FEATURE(AccTileExprList) - READ_FEATURE(AccLoopDirective) - READ_FEATURE(AccWaitArgument) - READ_FEATURE(AcImpliedDo) - READ_FEATURE(AcImpliedDoControl) - READ_FEATURE(AcValue) - READ_FEATURE(AccessStmt) - READ_FEATURE(AccessId) - READ_FEATURE(AccessSpec) - READ_FEATURE(AccessSpec::Kind) - READ_FEATURE(AcSpec) - READ_FEATURE(ActionStmt) - READ_FEATURE(ActualArg) - READ_FEATURE(ActualArg::PercentRef) - READ_FEATURE(ActualArg::PercentVal) - READ_FEATURE(ActualArgSpec) - READ_FEATURE(AcValue::Triplet) - READ_FEATURE(AllocOpt) - READ_FEATURE(AllocOpt::Mold) - READ_FEATURE(AllocOpt::Source) - READ_FEATURE(Allocatable) - READ_FEATURE(AllocatableStmt) - READ_FEATURE(AllocateCoarraySpec) - READ_FEATURE(AllocateObject) - READ_FEATURE(AllocateShapeSpec) - READ_FEATURE(AllocateStmt) - READ_FEATURE(Allocation) - READ_FEATURE(AltReturnSpec) - READ_FEATURE(ArithmeticIfStmt) - READ_FEATURE(ArrayConstructor) - READ_FEATURE(ArrayElement) - 
READ_FEATURE(ArraySpec) - READ_FEATURE(AssignStmt) - READ_FEATURE(AssignedGotoStmt) - READ_FEATURE(AssignmentStmt) - READ_FEATURE(AssociateConstruct) - READ_FEATURE(AssociateStmt) - READ_FEATURE(Association) - READ_FEATURE(AssumedImpliedSpec) - READ_FEATURE(AssumedRankSpec) - READ_FEATURE(AssumedShapeSpec) - READ_FEATURE(AssumedSizeSpec) - READ_FEATURE(Asynchronous) - READ_FEATURE(AsynchronousStmt) - READ_FEATURE(AttrSpec) - READ_FEATURE(BOZLiteralConstant) - READ_FEATURE(BackspaceStmt) - READ_FEATURE(BasedPointer) - READ_FEATURE(BasedPointerStmt) - READ_FEATURE(BindAttr) - READ_FEATURE(BindAttr::Deferred) - READ_FEATURE(BindAttr::Non_Overridable) - READ_FEATURE(BindEntity) - READ_FEATURE(BindEntity::Kind) - READ_FEATURE(BindStmt) - READ_FEATURE(Block) - READ_FEATURE(BlockConstruct) - READ_FEATURE(BlockData) - READ_FEATURE(BlockDataStmt) - READ_FEATURE(BlockSpecificationPart) - READ_FEATURE(BlockStmt) - READ_FEATURE(BoundsRemapping) - READ_FEATURE(BoundsSpec) - READ_FEATURE(Call) - READ_FEATURE(CallStmt) - READ_FEATURE(CaseConstruct) - READ_FEATURE(CaseConstruct::Case) - READ_FEATURE(CaseSelector) - READ_FEATURE(CaseStmt) - READ_FEATURE(CaseValueRange) - READ_FEATURE(CaseValueRange::Range) - READ_FEATURE(ChangeTeamConstruct) - READ_FEATURE(ChangeTeamStmt) - READ_FEATURE(CharLength) - READ_FEATURE(CharLiteralConstant) - READ_FEATURE(CharLiteralConstantSubstring) - READ_FEATURE(CharSelector) - READ_FEATURE(CharSelector::LengthAndKind) - READ_FEATURE(CloseStmt) - READ_FEATURE(CloseStmt::CloseSpec) - READ_FEATURE(CoarrayAssociation) - READ_FEATURE(CoarraySpec) - READ_FEATURE(CodimensionDecl) - READ_FEATURE(CodimensionStmt) - READ_FEATURE(CoindexedNamedObject) - READ_FEATURE(CommonBlockObject) - READ_FEATURE(CommonStmt) - READ_FEATURE(CommonStmt::Block) - READ_FEATURE(CompilerDirective) - READ_FEATURE(CompilerDirective::IgnoreTKR) - READ_FEATURE(CompilerDirective::LoopCount) - READ_FEATURE(CompilerDirective::NameValue) - READ_FEATURE(ComplexLiteralConstant) - READ_FEATURE(ComplexPart) - READ_FEATURE(ComponentArraySpec) - READ_FEATURE(ComponentAttrSpec) - READ_FEATURE(ComponentDataSource) - READ_FEATURE(ComponentDecl) - READ_FEATURE(FillDecl) - READ_FEATURE(ComponentOrFill) - READ_FEATURE(ComponentDefStmt) - READ_FEATURE(ComponentSpec) - READ_FEATURE(ComputedGotoStmt) - READ_FEATURE(ConcurrentControl) - READ_FEATURE(ConcurrentHeader) - READ_FEATURE(ConnectSpec) - READ_FEATURE(ConnectSpec::CharExpr) - READ_FEATURE(ConnectSpec::CharExpr::Kind) - READ_FEATURE(ConnectSpec::Newunit) - READ_FEATURE(ConnectSpec::Recl) - READ_FEATURE(ContainsStmt) - READ_FEATURE(Contiguous) - READ_FEATURE(ContiguousStmt) - READ_FEATURE(ContinueStmt) - READ_FEATURE(CriticalConstruct) - READ_FEATURE(CriticalStmt) - READ_FEATURE(CycleStmt) - READ_FEATURE(DataComponentDefStmt) - READ_FEATURE(DataIDoObject) - READ_FEATURE(DataImpliedDo) - READ_FEATURE(DataRef) - READ_FEATURE(DataStmt) - READ_FEATURE(DataStmtConstant) - READ_FEATURE(DataStmtObject) - READ_FEATURE(DataStmtRepeat) - READ_FEATURE(DataStmtSet) - READ_FEATURE(DataStmtValue) - READ_FEATURE(DeallocateStmt) - READ_FEATURE(DeclarationConstruct) - READ_FEATURE(DeclarationTypeSpec) - READ_FEATURE(DeclarationTypeSpec::Class) - READ_FEATURE(DeclarationTypeSpec::ClassStar) - READ_FEATURE(DeclarationTypeSpec::Record) - READ_FEATURE(DeclarationTypeSpec::Type) - READ_FEATURE(DeclarationTypeSpec::TypeStar) - READ_FEATURE(Default) - READ_FEATURE(DeferredCoshapeSpecList) - READ_FEATURE(DeferredShapeSpecList) - READ_FEATURE(DefinedOpName) - READ_FEATURE(DefinedOperator) - 
READ_FEATURE(DefinedOperator::IntrinsicOperator) - READ_FEATURE(DerivedTypeDef) - READ_FEATURE(DerivedTypeSpec) - READ_FEATURE(DerivedTypeStmt) - READ_FEATURE(Designator) - READ_FEATURE(DimensionStmt) - READ_FEATURE(DimensionStmt::Declaration) - READ_FEATURE(DoConstruct) - READ_FEATURE(DummyArg) - READ_FEATURE(ElseIfStmt) - READ_FEATURE(ElseStmt) - READ_FEATURE(ElsewhereStmt) - READ_FEATURE(EndAssociateStmt) - READ_FEATURE(EndBlockDataStmt) - READ_FEATURE(EndBlockStmt) - READ_FEATURE(EndChangeTeamStmt) - READ_FEATURE(EndCriticalStmt) - READ_FEATURE(EndDoStmt) - READ_FEATURE(EndEnumStmt) - READ_FEATURE(EndForallStmt) - READ_FEATURE(EndFunctionStmt) - READ_FEATURE(EndIfStmt) - READ_FEATURE(EndInterfaceStmt) - READ_FEATURE(EndLabel) - READ_FEATURE(EndModuleStmt) - READ_FEATURE(EndMpSubprogramStmt) - READ_FEATURE(EndProgramStmt) - READ_FEATURE(EndSelectStmt) - READ_FEATURE(EndSubmoduleStmt) - READ_FEATURE(EndSubroutineStmt) - READ_FEATURE(EndTypeStmt) - READ_FEATURE(EndWhereStmt) - READ_FEATURE(EndfileStmt) - READ_FEATURE(EntityDecl) - READ_FEATURE(EntryStmt) - READ_FEATURE(EnumDef) - READ_FEATURE(EnumDefStmt) - READ_FEATURE(Enumerator) - READ_FEATURE(EnumeratorDefStmt) - READ_FEATURE(EorLabel) - READ_FEATURE(EquivalenceObject) - READ_FEATURE(EquivalenceStmt) - READ_FEATURE(ErrLabel) - READ_FEATURE(ErrorRecovery) - READ_FEATURE(EventPostStmt) - READ_FEATURE(EventWaitStmt) - READ_FEATURE(EventWaitStmt::EventWaitSpec) - READ_FEATURE(ExecutableConstruct) - READ_FEATURE(ExecutionPart) - READ_FEATURE(ExecutionPartConstruct) - READ_FEATURE(ExitStmt) - READ_FEATURE(ExplicitCoshapeSpec) - READ_FEATURE(ExplicitShapeSpec) - READ_FEATURE(Expr) - READ_FEATURE(Expr::Parentheses) - READ_FEATURE(Expr::UnaryPlus) - READ_FEATURE(Expr::Negate) - READ_FEATURE(Expr::NOT) - READ_FEATURE(Expr::PercentLoc) - READ_FEATURE(Expr::DefinedUnary) - READ_FEATURE(Expr::Power) - READ_FEATURE(Expr::Multiply) - READ_FEATURE(Expr::Divide) - READ_FEATURE(Expr::Add) - READ_FEATURE(Expr::Subtract) - READ_FEATURE(Expr::Concat) - READ_FEATURE(Expr::LT) - READ_FEATURE(Expr::LE) - READ_FEATURE(Expr::EQ) - READ_FEATURE(Expr::NE) - READ_FEATURE(Expr::GE) - READ_FEATURE(Expr::GT) - READ_FEATURE(Expr::AND) - READ_FEATURE(Expr::OR) - READ_FEATURE(Expr::EQV) - READ_FEATURE(Expr::NEQV) - READ_FEATURE(Expr::DefinedBinary) - READ_FEATURE(Expr::ComplexConstructor) - READ_FEATURE(External) - READ_FEATURE(ExternalStmt) - READ_FEATURE(FailImageStmt) - READ_FEATURE(FileUnitNumber) - READ_FEATURE(FinalProcedureStmt) - READ_FEATURE(FlushStmt) - READ_FEATURE(ForallAssignmentStmt) - READ_FEATURE(ForallBodyConstruct) - READ_FEATURE(ForallConstruct) - READ_FEATURE(ForallConstructStmt) - READ_FEATURE(ForallStmt) - READ_FEATURE(FormTeamStmt) - READ_FEATURE(FormTeamStmt::FormTeamSpec) - READ_FEATURE(Format) - READ_FEATURE(FormatStmt) - READ_FEATURE(FunctionReference) - READ_FEATURE(FunctionStmt) - READ_FEATURE(FunctionSubprogram) - READ_FEATURE(GenericSpec) - READ_FEATURE(GenericSpec::Assignment) - READ_FEATURE(GenericSpec::ReadFormatted) - READ_FEATURE(GenericSpec::ReadUnformatted) - READ_FEATURE(GenericSpec::WriteFormatted) - READ_FEATURE(GenericSpec::WriteUnformatted) - READ_FEATURE(GenericStmt) - READ_FEATURE(GotoStmt) - READ_FEATURE(HollerithLiteralConstant) - READ_FEATURE(IdExpr) - READ_FEATURE(IdVariable) - READ_FEATURE(IfConstruct) - READ_FEATURE(IfConstruct::ElseBlock) - READ_FEATURE(IfConstruct::ElseIfBlock) - READ_FEATURE(IfStmt) - READ_FEATURE(IfThenStmt) - READ_FEATURE(TeamValue) - READ_FEATURE(ImageSelector) - READ_FEATURE(ImageSelectorSpec) - 
READ_FEATURE(ImageSelectorSpec::Stat) - READ_FEATURE(ImageSelectorSpec::Team_Number) - READ_FEATURE(ImplicitPart) - READ_FEATURE(ImplicitPartStmt) - READ_FEATURE(ImplicitSpec) - READ_FEATURE(ImplicitStmt) - READ_FEATURE(ImplicitStmt::ImplicitNoneNameSpec) - READ_FEATURE(ImpliedShapeSpec) - READ_FEATURE(ImportStmt) - READ_FEATURE(Initialization) - READ_FEATURE(InputImpliedDo) - READ_FEATURE(InputItem) - READ_FEATURE(InquireSpec) - READ_FEATURE(InquireSpec::CharVar) - READ_FEATURE(InquireSpec::CharVar::Kind) - READ_FEATURE(InquireSpec::IntVar) - READ_FEATURE(InquireSpec::IntVar::Kind) - READ_FEATURE(InquireSpec::LogVar) - READ_FEATURE(InquireSpec::LogVar::Kind) - READ_FEATURE(InquireStmt) - READ_FEATURE(InquireStmt::Iolength) - READ_FEATURE(IntegerTypeSpec) - READ_FEATURE(IntentSpec) - READ_FEATURE(IntentSpec::Intent) - READ_FEATURE(IntentStmt) - READ_FEATURE(InterfaceBlock) - READ_FEATURE(InterfaceBody) - READ_FEATURE(InterfaceBody::Function) - READ_FEATURE(InterfaceBody::Subroutine) - READ_FEATURE(InterfaceSpecification) - READ_FEATURE(InterfaceStmt) - READ_FEATURE(InternalSubprogram) - READ_FEATURE(InternalSubprogramPart) - READ_FEATURE(Intrinsic) - READ_FEATURE(IntrinsicStmt) - READ_FEATURE(IntrinsicTypeSpec) - READ_FEATURE(IntrinsicTypeSpec::Character) - READ_FEATURE(IntrinsicTypeSpec::Complex) - READ_FEATURE(IntrinsicTypeSpec::DoubleComplex) - READ_FEATURE(IntrinsicTypeSpec::DoublePrecision) - READ_FEATURE(IntrinsicTypeSpec::Logical) - READ_FEATURE(IntrinsicTypeSpec::Real) - READ_FEATURE(IoControlSpec) - READ_FEATURE(IoControlSpec::Asynchronous) - READ_FEATURE(IoControlSpec::CharExpr) - READ_FEATURE(IoControlSpec::CharExpr::Kind) - READ_FEATURE(IoControlSpec::Pos) - READ_FEATURE(IoControlSpec::Rec) - READ_FEATURE(IoControlSpec::Size) - READ_FEATURE(IoUnit) - READ_FEATURE(Keyword) - READ_FEATURE(KindParam) - READ_FEATURE(KindSelector) - READ_FEATURE(KindSelector::StarSize) - READ_FEATURE(LabelDoStmt) - READ_FEATURE(LanguageBindingSpec) - READ_FEATURE(LengthSelector) - READ_FEATURE(LetterSpec) - READ_FEATURE(LiteralConstant) - READ_FEATURE(IntLiteralConstant) - READ_FEATURE(LocalitySpec) - READ_FEATURE(LocalitySpec::DefaultNone) - READ_FEATURE(LocalitySpec::Local) - READ_FEATURE(LocalitySpec::LocalInit) - READ_FEATURE(LocalitySpec::Shared) - READ_FEATURE(LockStmt) - READ_FEATURE(LockStmt::LockStat) - READ_FEATURE(LogicalLiteralConstant) - READ_FEATURE(LoopControl) - READ_FEATURE(LoopControl::Concurrent) - READ_FEATURE(MainProgram) - READ_FEATURE(Map) - READ_FEATURE(Map::EndMapStmt) - READ_FEATURE(Map::MapStmt) - READ_FEATURE(MaskedElsewhereStmt) - READ_FEATURE(Module) - READ_FEATURE(ModuleStmt) - READ_FEATURE(ModuleSubprogram) - READ_FEATURE(ModuleSubprogramPart) - READ_FEATURE(MpSubprogramStmt) - READ_FEATURE(MsgVariable) - READ_FEATURE(Name) - READ_FEATURE(NamedConstant) - READ_FEATURE(NamedConstantDef) - READ_FEATURE(NamelistStmt) - READ_FEATURE(NamelistStmt::Group) - READ_FEATURE(NonLabelDoStmt) - READ_FEATURE(NoPass) - READ_FEATURE(NullifyStmt) - READ_FEATURE(NullInit) - READ_FEATURE(ObjectDecl) - READ_FEATURE(OldParameterStmt) - READ_FEATURE(OmpAlignedClause) - READ_FEATURE(OmpAtomic) - READ_FEATURE(OmpAtomicCapture) - READ_FEATURE(OmpAtomicCapture::Stmt1) - READ_FEATURE(OmpAtomicCapture::Stmt2) - READ_FEATURE(OmpAtomicRead) - READ_FEATURE(OmpAtomicUpdate) - READ_FEATURE(OmpAtomicWrite) - READ_FEATURE(OmpBeginBlockDirective) - READ_FEATURE(OmpBeginLoopDirective) - READ_FEATURE(OmpBeginSectionsDirective) - READ_FEATURE(OmpBlockDirective) - READ_FEATURE(OmpCancelType) - 
READ_FEATURE(OmpCancelType::Type) - READ_FEATURE(OmpClause) - READ_FEATURE(OmpClauseList) - READ_FEATURE(OmpCriticalDirective) - READ_FEATURE(OmpDeclareTargetSpecifier) - READ_FEATURE(OmpDeclareTargetWithClause) - READ_FEATURE(OmpDeclareTargetWithList) - READ_FEATURE(OmpDefaultClause) - READ_FEATURE(OmpDefaultClause::Type) - READ_FEATURE(OmpDefaultmapClause) - READ_FEATURE(OmpDefaultmapClause::ImplicitBehavior) - READ_FEATURE(OmpDefaultmapClause::VariableCategory) - READ_FEATURE(OmpDependClause) - READ_FEATURE(OmpDependClause::InOut) - READ_FEATURE(OmpDependClause::Sink) - READ_FEATURE(OmpDependClause::Source) - READ_FEATURE(OmpDependenceType) - READ_FEATURE(OmpDependenceType::Type) - READ_FEATURE(OmpDependSinkVec) - READ_FEATURE(OmpDependSinkVecLength) - READ_FEATURE(OmpEndAtomic) - READ_FEATURE(OmpEndBlockDirective) - READ_FEATURE(OmpEndCriticalDirective) - READ_FEATURE(OmpEndLoopDirective) - READ_FEATURE(OmpEndSectionsDirective) - READ_FEATURE(OmpIfClause) - READ_FEATURE(OmpIfClause::DirectiveNameModifier) - READ_FEATURE(OmpLinearClause) - READ_FEATURE(OmpLinearClause::WithModifier) - READ_FEATURE(OmpLinearClause::WithoutModifier) - READ_FEATURE(OmpLinearModifier) - READ_FEATURE(OmpLinearModifier::Type) - READ_FEATURE(OmpLoopDirective) - READ_FEATURE(OmpMapClause) - READ_FEATURE(OmpMapType) - READ_FEATURE(OmpMapType::Always) - READ_FEATURE(OmpMapType::Type) - READ_FEATURE(OmpObject) - READ_FEATURE(OmpObjectList) - READ_FEATURE(OmpOrderClause) - READ_FEATURE(OmpOrderClause::Type) - READ_FEATURE(OmpOrderModifier) - READ_FEATURE(OmpOrderModifier::Kind) - READ_FEATURE(OmpProcBindClause) - READ_FEATURE(OmpProcBindClause::Type) - READ_FEATURE(OmpReductionClause) - READ_FEATURE(OmpInReductionClause) - READ_FEATURE(OmpReductionCombiner) - READ_FEATURE(OmpReductionCombiner::FunctionCombiner) - READ_FEATURE(OmpReductionInitializerClause) - READ_FEATURE(OmpReductionOperator) - READ_FEATURE(OmpAllocateClause) - READ_FEATURE(OmpAllocateClause::Allocator) - READ_FEATURE(OmpScheduleClause) - READ_FEATURE(OmpScheduleClause::ScheduleType) - READ_FEATURE(OmpDeviceClause) - READ_FEATURE(OmpDeviceClause::DeviceModifier) - READ_FEATURE(OmpDeviceTypeClause) - READ_FEATURE(OmpDeviceTypeClause::Type) - READ_FEATURE(OmpScheduleModifier) - READ_FEATURE(OmpScheduleModifier::Modifier1) - READ_FEATURE(OmpScheduleModifier::Modifier2) - READ_FEATURE(OmpScheduleModifierType) - READ_FEATURE(OmpScheduleModifierType::ModType) - READ_FEATURE(OmpSectionBlocks) - READ_FEATURE(OmpSectionsDirective) - READ_FEATURE(OmpSimpleStandaloneDirective) - READ_FEATURE(Only) - READ_FEATURE(OpenACCAtomicConstruct) - READ_FEATURE(OpenACCBlockConstruct) - READ_FEATURE(OpenACCCacheConstruct) - READ_FEATURE(OpenACCCombinedConstruct) - READ_FEATURE(OpenACCConstruct) - READ_FEATURE(OpenACCDeclarativeConstruct) - READ_FEATURE(OpenACCLoopConstruct) - READ_FEATURE(OpenACCRoutineConstruct) - READ_FEATURE(OpenACCStandaloneDeclarativeConstruct) - READ_FEATURE(OpenACCStandaloneConstruct) - READ_FEATURE(OpenACCWaitConstruct) - READ_FEATURE(OpenMPAtomicConstruct) - READ_FEATURE(OpenMPBlockConstruct) - READ_FEATURE(OpenMPCancelConstruct) - READ_FEATURE(OpenMPCancelConstruct::If) - READ_FEATURE(OpenMPCancellationPointConstruct) - READ_FEATURE(OpenMPConstruct) - READ_FEATURE(OpenMPCriticalConstruct) - READ_FEATURE(OpenMPDeclarativeAllocate) - READ_FEATURE(OpenMPDeclarativeConstruct) - READ_FEATURE(OpenMPDeclareReductionConstruct) - READ_FEATURE(OpenMPDeclareSimdConstruct) - READ_FEATURE(OpenMPDeclareTargetConstruct) - READ_FEATURE(OmpMemoryOrderClause) - 
READ_FEATURE(OmpAtomicClause) - READ_FEATURE(OmpAtomicClauseList) - READ_FEATURE(OmpAtomicDefaultMemOrderClause) - READ_FEATURE(OmpAtomicDefaultMemOrderClause::Type) - READ_FEATURE(OpenMPFlushConstruct) - READ_FEATURE(OpenMPLoopConstruct) - READ_FEATURE(OpenMPExecutableAllocate) - READ_FEATURE(OpenMPRequiresConstruct) - READ_FEATURE(OpenMPSimpleStandaloneConstruct) - READ_FEATURE(OpenMPStandaloneConstruct) - READ_FEATURE(OpenMPSectionConstruct) - READ_FEATURE(OpenMPSectionsConstruct) - READ_FEATURE(OpenMPThreadprivate) - READ_FEATURE(OpenStmt) - READ_FEATURE(Optional) - READ_FEATURE(OptionalStmt) - READ_FEATURE(OtherSpecificationStmt) - READ_FEATURE(OutputImpliedDo) - READ_FEATURE(OutputItem) - READ_FEATURE(Parameter) - READ_FEATURE(ParameterStmt) - READ_FEATURE(ParentIdentifier) - READ_FEATURE(Pass) - READ_FEATURE(PauseStmt) - READ_FEATURE(Pointer) - READ_FEATURE(PointerAssignmentStmt) - READ_FEATURE(PointerAssignmentStmt::Bounds) - READ_FEATURE(PointerDecl) - READ_FEATURE(PointerObject) - READ_FEATURE(PointerStmt) - READ_FEATURE(PositionOrFlushSpec) - READ_FEATURE(PrefixSpec) - READ_FEATURE(PrefixSpec::Elemental) - READ_FEATURE(PrefixSpec::Impure) - READ_FEATURE(PrefixSpec::Module) - READ_FEATURE(PrefixSpec::Non_Recursive) - READ_FEATURE(PrefixSpec::Pure) - READ_FEATURE(PrefixSpec::Recursive) - READ_FEATURE(PrintStmt) - READ_FEATURE(PrivateStmt) - READ_FEATURE(PrivateOrSequence) - READ_FEATURE(ProcAttrSpec) - READ_FEATURE(ProcComponentAttrSpec) - READ_FEATURE(ProcComponentDefStmt) - READ_FEATURE(ProcComponentRef) - READ_FEATURE(ProcDecl) - READ_FEATURE(ProcInterface) - READ_FEATURE(ProcPointerInit) - READ_FEATURE(ProcedureDeclarationStmt) - READ_FEATURE(ProcedureDesignator) - READ_FEATURE(ProcedureStmt) - READ_FEATURE(ProcedureStmt::Kind) - READ_FEATURE(Program) - READ_FEATURE(ProgramStmt) - READ_FEATURE(ProgramUnit) - READ_FEATURE(Protected) - READ_FEATURE(ProtectedStmt) - READ_FEATURE(ReadStmt) - READ_FEATURE(RealLiteralConstant) - READ_FEATURE(RealLiteralConstant::Real) - READ_FEATURE(Rename) - READ_FEATURE(Rename::Names) - READ_FEATURE(Rename::Operators) - READ_FEATURE(ReturnStmt) - READ_FEATURE(RewindStmt) - READ_FEATURE(Save) - READ_FEATURE(SaveStmt) - READ_FEATURE(SavedEntity) - READ_FEATURE(SavedEntity::Kind) - READ_FEATURE(SectionSubscript) - READ_FEATURE(SelectCaseStmt) - READ_FEATURE(SelectRankCaseStmt) - READ_FEATURE(SelectRankCaseStmt::Rank) - READ_FEATURE(SelectRankConstruct) - READ_FEATURE(SelectRankConstruct::RankCase) - READ_FEATURE(SelectRankStmt) - READ_FEATURE(SelectTypeConstruct) - READ_FEATURE(SelectTypeConstruct::TypeCase) - READ_FEATURE(SelectTypeStmt) - READ_FEATURE(Selector) - READ_FEATURE(SeparateModuleSubprogram) - READ_FEATURE(SequenceStmt) - READ_FEATURE(Sign) - READ_FEATURE(SignedComplexLiteralConstant) - READ_FEATURE(SignedIntLiteralConstant) - READ_FEATURE(SignedRealLiteralConstant) - READ_FEATURE(SpecificationConstruct) - READ_FEATURE(SpecificationExpr) - READ_FEATURE(SpecificationPart) - READ_FEATURE(Star) - READ_FEATURE(StatOrErrmsg) - READ_FEATURE(StatVariable) - READ_FEATURE(StatusExpr) - READ_FEATURE(StmtFunctionStmt) - READ_FEATURE(StopCode) - READ_FEATURE(StopStmt) - READ_FEATURE(StopStmt::Kind) - READ_FEATURE(StructureComponent) - READ_FEATURE(StructureConstructor) - READ_FEATURE(StructureDef) - READ_FEATURE(StructureDef::EndStructureStmt) - READ_FEATURE(StructureField) - READ_FEATURE(StructureStmt) - READ_FEATURE(Submodule) - READ_FEATURE(SubmoduleStmt) - READ_FEATURE(SubroutineStmt) - READ_FEATURE(SubroutineSubprogram) - 
READ_FEATURE(SubscriptTriplet) - READ_FEATURE(Substring) - READ_FEATURE(SubstringInquiry) - READ_FEATURE(SubstringRange) - READ_FEATURE(Suffix) - READ_FEATURE(SyncAllStmt) - READ_FEATURE(SyncImagesStmt) - READ_FEATURE(SyncImagesStmt::ImageSet) - READ_FEATURE(SyncMemoryStmt) - READ_FEATURE(SyncTeamStmt) - READ_FEATURE(Target) - READ_FEATURE(TargetStmt) - READ_FEATURE(TypeAttrSpec) - READ_FEATURE(TypeAttrSpec::BindC) - READ_FEATURE(TypeAttrSpec::Extends) - READ_FEATURE(TypeBoundGenericStmt) - READ_FEATURE(TypeBoundProcBinding) - READ_FEATURE(TypeBoundProcDecl) - READ_FEATURE(TypeBoundProcedurePart) - READ_FEATURE(TypeBoundProcedureStmt) - READ_FEATURE(TypeBoundProcedureStmt::WithInterface) - READ_FEATURE(TypeBoundProcedureStmt::WithoutInterface) - READ_FEATURE(TypeDeclarationStmt) - READ_FEATURE(TypeGuardStmt) - READ_FEATURE(TypeGuardStmt::Guard) - READ_FEATURE(TypeParamDecl) - READ_FEATURE(TypeParamDefStmt) - READ_FEATURE(common::TypeParamAttr) - READ_FEATURE(TypeParamSpec) - READ_FEATURE(TypeParamValue) - READ_FEATURE(TypeParamValue::Deferred) - READ_FEATURE(TypeSpec) - READ_FEATURE(Union) - READ_FEATURE(Union::EndUnionStmt) - READ_FEATURE(Union::UnionStmt) - READ_FEATURE(UnlockStmt) - READ_FEATURE(UseStmt) - READ_FEATURE(UseStmt::ModuleNature) - READ_FEATURE(Value) - READ_FEATURE(ValueStmt) - READ_FEATURE(Variable) - READ_FEATURE(Verbatim) - READ_FEATURE(Volatile) - READ_FEATURE(VolatileStmt) - READ_FEATURE(WaitSpec) - READ_FEATURE(WaitStmt) - READ_FEATURE(WhereBodyConstruct) - READ_FEATURE(WhereConstruct) - READ_FEATURE(WhereConstruct::Elsewhere) - READ_FEATURE(WhereConstruct::MaskedElsewhere) - READ_FEATURE(WhereConstructStmt) - READ_FEATURE(WhereStmt) - READ_FEATURE(WriteStmt) - - READ_FEATURE(llvm::omp::Directive) - READ_FEATURE(llvm::omp::Clause) - READ_FEATURE(llvm::acc::Directive) - READ_FEATURE(llvm::acc::DefaultValue) - - template bool Pre(const A &) { return true; } - template void Post(const A &) {} - - template bool Pre(const Statement &) { return true; } - template void Post(const Statement &) {} - - template bool Pre(const UnlabeledStatement &) { return true; } - template void Post(const UnlabeledStatement &) {} - - template bool Pre(const common::Indirection &) { - return true; - } - template void Post(const common::Indirection &) {} - - template bool Pre(const Scalar &) { return true; } - template void Post(const Scalar &) {} - - template bool Pre(const Constant &) { return true; } - template void Post(const Constant &) {} - - template bool Pre(const Integer &) { return true; } - template void Post(const Integer &) {} - - template bool Pre(const Logical &) { return true; } - template void Post(const Logical &) {} - - template bool Pre(const DefaultChar &) { return true; } - template void Post(const DefaultChar &) {} - - template bool Pre(const std::tuple &) { return true; } - template void Post(const std::tuple &) {} - - template bool Pre(const std::variant &) { return true; } - template void Post(const std::variant &) {} -}; - -class FeatureListAction : public PluginParseTreeAction { - void executeAction() override { - NodeVisitor visitor; - Fortran::parser::Walk(getParsing().parseTree(), visitor); - - for (auto const &[feature, frequency] : visitor.getFrequencies()) { - llvm::outs() << feature << ": " << frequency << "\n"; - } - } - - bool beginSourceFileAction() override { return runPrescan() && runParse(); } -}; - -static FrontendPluginRegistry::Add X( - "feature-list", "List program features"); diff --git a/flang/test/CMakeLists.txt b/flang/test/CMakeLists.txt index 
7d96a72e5f36d..4de1036dfb52b 100644 --- a/flang/test/CMakeLists.txt +++ b/flang/test/CMakeLists.txt @@ -79,7 +79,6 @@ if (LLVM_BUILD_EXAMPLES) list(APPEND FLANG_TEST_DEPENDS flangPrintFunctionNames flangOmpReport - flangFeatureList ) endif () diff --git a/flang/test/Examples/feature-list-class.f90 b/flang/test/Examples/feature-list-class.f90 deleted file mode 100644 index cba361b677f2a..0000000000000 --- a/flang/test/Examples/feature-list-class.f90 +++ /dev/null @@ -1,88 +0,0 @@ -! UNSUPPORTED: system-windows -! REQUIRES: plugins, shell, examples - -! RUN: %flang_fc1 -load %llvmshlibdir/flangFeatureList%pluginext \ -! RUN: -plugin feature-list %s 2>&1 | FileCheck %s - -module list_features_test - implicit none - - type :: test_class_1 - integer :: a - real :: b - contains - procedure :: sum => sum_test_class_1 - procedure :: set => set_values_test_class_1 - end type -contains - real function sum_test_class_1(self) - class(test_class_1), intent(in) :: self - sum_test_class_1 = self%a + self%b - end function - - subroutine set_values_test_class_1(self, a, b) - class(test_class_1), intent(out) :: self - integer, intent(in) :: a, b - self%a = a - self%b = b - end subroutine -end module list_features_test - -! CHECK: Name: 32 -! CHECK-NEXT: DerivedTypeSpec: 2 -! CHECK-NEXT: Expr::Add: 1 -! CHECK-NEXT: IntrinsicTypeSpec: 4 -! CHECK-NEXT: IntegerTypeSpec: 2 -! CHECK-NEXT: IntrinsicTypeSpec::Real: 2 -! CHECK-NEXT: DataRef: 11 -! CHECK-NEXT: StructureComponent: 4 -! CHECK-NEXT: Designator: 7 -! CHECK-NEXT: Expr: 5 -! CHECK-NEXT: Variable: 3 -! CHECK-NEXT: AttrSpec: 3 -! CHECK-NEXT: IntentSpec: 3 -! CHECK-NEXT: IntentSpec::Intent: 3 -! CHECK-NEXT: DummyArg: 3 -! CHECK-NEXT: DeclarationTypeSpec: 6 -! CHECK-NEXT: DeclarationTypeSpec::Class: 2 -! CHECK-NEXT: ImplicitStmt: 1 -! CHECK-NEXT: ImplicitPart: 3 -! CHECK-NEXT: ImplicitPartStmt: 1 -! CHECK-NEXT: PrefixSpec: 1 -! CHECK-NEXT: Module: 1 -! CHECK-NEXT: AssignmentStmt: 3 -! CHECK-NEXT: ActionStmt: 3 -! CHECK-NEXT: Block: 2 -! CHECK-NEXT: TypeBoundProcBinding: 2 -! CHECK-NEXT: TypeBoundProcedureStmt: 2 -! CHECK-NEXT: TypeBoundProcDecl: 2 -! CHECK-NEXT: TypeBoundProcedureStmt::WithoutInterface: 2 -! CHECK-NEXT: ComponentOrFill: 2 -! CHECK-NEXT: ComponentDecl: 2 -! CHECK-NEXT: DataComponentDefStmt: 2 -! CHECK-NEXT: ComponentDefStmt: 2 -! CHECK-NEXT: TypeBoundProcedurePart: 1 -! CHECK-NEXT: ContainsStmt: 2 -! CHECK-NEXT: EndTypeStmt: 1 -! CHECK-NEXT: DerivedTypeDef: 1 -! CHECK-NEXT: DerivedTypeStmt: 1 -! CHECK-NEXT: EntityDecl: 4 -! CHECK-NEXT: SpecificationConstruct: 4 -! CHECK-NEXT: TypeDeclarationStmt: 3 -! CHECK-NEXT: DeclarationConstruct: 4 -! CHECK-NEXT: EndFunctionStmt: 1 -! CHECK-NEXT: FunctionStmt: 1 -! CHECK-NEXT: EndSubroutineStmt: 1 -! CHECK-NEXT: SubroutineStmt: 1 -! CHECK-NEXT: ExecutionPartConstruct: 3 -! CHECK-NEXT: ExecutableConstruct: 3 -! CHECK-NEXT: SpecificationPart: 3 -! CHECK-NEXT: FunctionSubprogram: 1 -! CHECK-NEXT: ExecutionPart: 2 -! CHECK-NEXT: SubroutineSubprogram: 1 -! CHECK-NEXT: ModuleSubprogram: 2 -! CHECK-NEXT: ProgramUnit: 1 -! CHECK-NEXT: Program: 1 -! CHECK-NEXT: ModuleSubprogramPart: 1 -! CHECK-NEXT: EndModuleStmt: 1 -! CHECK-NEXT: ModuleStmt: 1 diff --git a/flang/test/Examples/feature-list-functions.f90 b/flang/test/Examples/feature-list-functions.f90 deleted file mode 100644 index a1913dda697c7..0000000000000 --- a/flang/test/Examples/feature-list-functions.f90 +++ /dev/null @@ -1,76 +0,0 @@ -! UNSUPPORTED: system-windows -! REQUIRES: plugins, shell, examples - -! 
RUN: %flang_fc1 -load %llvmshlibdir/flangFeatureList%pluginext \ -! RUN: -plugin feature-list %s 2>&1 | FileCheck %s - -program list_features_test - implicit none - call test_sub(test_func(2, 3), 4) -contains - subroutine test_sub(a, b) - integer, intent(in) :: a, b - print "(I0)", a + b - end subroutine - - integer function test_func(a, b) - integer, intent(in) :: a, b - test_func = a * b - end function -end program list_features_test - -! CHECK: Name: 19 -! CHECK-NEXT: IntLiteralConstant: 3 -! CHECK-NEXT: LiteralConstant: 4 -! CHECK-NEXT: CharLiteralConstant: 1 -! CHECK-NEXT: FunctionReference: 1 -! CHECK-NEXT: Call: 2 -! CHECK-NEXT: Expr::Multiply: 1 -! CHECK-NEXT: Expr::Add: 1 -! CHECK-NEXT: IntrinsicTypeSpec: 3 -! CHECK-NEXT: IntegerTypeSpec: 3 -! CHECK-NEXT: Format: 1 -! CHECK-NEXT: DataRef: 5 -! CHECK-NEXT: ProcedureDesignator: 2 -! CHECK-NEXT: Designator: 5 -! CHECK-NEXT: ActualArgSpec: 4 -! CHECK-NEXT: ActualArg: 4 -! CHECK-NEXT: Expr: 11 -! CHECK-NEXT: Variable: 1 -! CHECK-NEXT: AttrSpec: 2 -! CHECK-NEXT: IntentSpec: 2 -! CHECK-NEXT: IntentSpec::Intent: 2 -! CHECK-NEXT: DummyArg: 2 -! CHECK-NEXT: DeclarationTypeSpec: 3 -! CHECK-NEXT: ImplicitStmt: 1 -! CHECK-NEXT: ImplicitPart: 3 -! CHECK-NEXT: ImplicitPartStmt: 1 -! CHECK-NEXT: PrefixSpec: 1 -! CHECK-NEXT: OutputItem: 1 -! CHECK-NEXT: AssignmentStmt: 1 -! CHECK-NEXT: ActionStmt: 3 -! CHECK-NEXT: PrintStmt: 1 -! CHECK-NEXT: CallStmt: 1 -! CHECK-NEXT: Block: 3 -! CHECK-NEXT: ContainsStmt: 1 -! CHECK-NEXT: EntityDecl: 4 -! CHECK-NEXT: SpecificationConstruct: 2 -! CHECK-NEXT: TypeDeclarationStmt: 2 -! CHECK-NEXT: DeclarationConstruct: 2 -! CHECK-NEXT: EndFunctionStmt: 1 -! CHECK-NEXT: FunctionStmt: 1 -! CHECK-NEXT: EndSubroutineStmt: 1 -! CHECK-NEXT: SubroutineStmt: 1 -! CHECK-NEXT: ExecutionPartConstruct: 3 -! CHECK-NEXT: ExecutableConstruct: 3 -! CHECK-NEXT: SpecificationPart: 3 -! CHECK-NEXT: FunctionSubprogram: 1 -! CHECK-NEXT: ExecutionPart: 3 -! CHECK-NEXT: InternalSubprogramPart: 1 -! CHECK-NEXT: InternalSubprogram: 2 -! CHECK-NEXT: SubroutineSubprogram: 1 -! CHECK-NEXT: ProgramUnit: 1 -! CHECK-NEXT: MainProgram: 1 -! CHECK-NEXT: Program: 1 -! CHECK-NEXT: EndProgramStmt: 1 -! CHECK-NEXT: ProgramStmt: 1 From 473e9adb84c29764da51e4d2a995fce9b2e2ffa5 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Wed, 22 Mar 2023 14:01:39 -0700 Subject: [PATCH 040/208] [MSAN] Update vector load/store tests to use proper attribute I had made a mistake when pre-committing the tests; caught in review of D146157. 
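For context: MemorySanitizer keys its instrumentation off the sanitize_memory IR attribute, which Clang attaches when a file is built with -fsanitize=memory; a function without that attribute only receives minimal handling, which is why the regenerated CHECK lines below gain the full shadow load/store sequences. A minimal, hypothetical C++ source (not part of this patch) that would carry the correct attribute:

    // Hypothetical illustration, not from this patch. Built with
    //   clang++ -O2 -fsanitize=memory example.cpp -S -emit-llvm
    // the emitted definition of copy16 is marked sanitize_memory, so the
    // MSan pass pairs each load/store with a shadow access of the same
    // shape as those checked in the updated test.
    #include <cstdint>

    void copy16(const std::int32_t *src, std::int32_t *dst) {
      for (int i = 0; i < 16; ++i)
        dst[i] = src[i]; // each data store gets a matching shadow store
    }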
--- .../MemorySanitizer/vector-load-store.ll | 215 +++++++++++++++--- 1 file changed, 180 insertions(+), 35 deletions(-) diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector-load-store.ll b/llvm/test/Instrumentation/MemorySanitizer/vector-load-store.ll index 52c60e9b83b29..a2245763abbc7 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/vector-load-store.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/vector-load-store.ll @@ -6,100 +6,210 @@ target triple = "x86_64-unknown-linux-gnu" target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -define void @load.v1i32(ptr %p) sanitize_address { +define void @load.v1i32(ptr %p) sanitize_memory { ; CHECK-LABEL: @load.v1i32( ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <1 x i32>, ptr [[TMP4]], align 4 ; CHECK-NEXT: ret void ; ; ADDR-LABEL: @load.v1i32( +; ADDR-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; ADDR-NEXT: call void @llvm.donothing() -; ADDR-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 4 +; ADDR-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; ADDR-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0:![0-9]+]] +; ADDR: 2: +; ADDR-NEXT: call void @__msan_warning_noreturn() #[[ATTR3:[0-9]+]] +; ADDR-NEXT: unreachable +; ADDR: 3: +; ADDR-NEXT: [[TMP4:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 4 +; ADDR-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64 +; ADDR-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; ADDR-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; ADDR-NEXT: [[_MSLD:%.*]] = load <1 x i32>, ptr [[TMP7]], align 4 ; ADDR-NEXT: ret void ; ; ORIGINS-LABEL: @load.v1i32( ; ORIGINS-NEXT: call void @llvm.donothing() ; ORIGINS-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 4 +; ORIGINS-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64 +; ORIGINS-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080 +; ORIGINS-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +; ORIGINS-NEXT: [[TMP5:%.*]] = add i64 [[TMP3]], 17592186044416 +; ORIGINS-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; ORIGINS-NEXT: [[_MSLD:%.*]] = load <1 x i32>, ptr [[TMP4]], align 4 +; ORIGINS-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 ; ORIGINS-NEXT: ret void ; load <1 x i32>, ptr %p ret void } -define void @load.v2i32(ptr %p) sanitize_address { +define void @load.v2i32(ptr %p) sanitize_memory { ; CHECK-LABEL: @load.v2i32( ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 8 ; CHECK-NEXT: ret void ; ; ADDR-LABEL: @load.v2i32( +; ADDR-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; ADDR-NEXT: call void @llvm.donothing() -; ADDR-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 8 +; ADDR-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; ADDR-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] +; ADDR: 2: +; ADDR-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; ADDR-NEXT: unreachable +; ADDR: 3: +; ADDR-NEXT: [[TMP4:%.*]] = load <2 x 
i32>, ptr [[P:%.*]], align 8 +; ADDR-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64 +; ADDR-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; ADDR-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; ADDR-NEXT: [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP7]], align 8 ; ADDR-NEXT: ret void ; ; ORIGINS-LABEL: @load.v2i32( ; ORIGINS-NEXT: call void @llvm.donothing() ; ORIGINS-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 8 +; ORIGINS-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64 +; ORIGINS-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080 +; ORIGINS-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +; ORIGINS-NEXT: [[TMP5:%.*]] = add i64 [[TMP3]], 17592186044416 +; ORIGINS-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; ORIGINS-NEXT: [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 8 +; ORIGINS-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 8 ; ORIGINS-NEXT: ret void ; load <2 x i32>, ptr %p ret void } -define void @load.v4i32(ptr %p) sanitize_address { +define void @load.v4i32(ptr %p) sanitize_memory { ; CHECK-LABEL: @load.v4i32( ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 16 ; CHECK-NEXT: ret void ; ; ADDR-LABEL: @load.v4i32( +; ADDR-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; ADDR-NEXT: call void @llvm.donothing() -; ADDR-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 16 +; ADDR-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; ADDR-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] +; ADDR: 2: +; ADDR-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; ADDR-NEXT: unreachable +; ADDR: 3: +; ADDR-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 16 +; ADDR-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64 +; ADDR-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; ADDR-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; ADDR-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 16 ; ADDR-NEXT: ret void ; ; ORIGINS-LABEL: @load.v4i32( ; ORIGINS-NEXT: call void @llvm.donothing() ; ORIGINS-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 16 +; ORIGINS-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64 +; ORIGINS-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080 +; ORIGINS-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +; ORIGINS-NEXT: [[TMP5:%.*]] = add i64 [[TMP3]], 17592186044416 +; ORIGINS-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; ORIGINS-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 16 +; ORIGINS-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 16 ; ORIGINS-NEXT: ret void ; load <4 x i32>, ptr %p ret void } -define void @load.v8i32(ptr %p) sanitize_address { +define void @load.v8i32(ptr %p) sanitize_memory { ; CHECK-LABEL: @load.v8i32( ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr [[P:%.*]], align 32 +; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32 ; CHECK-NEXT: ret void ; ; ADDR-LABEL: @load.v8i32( +; ADDR-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; ADDR-NEXT: call void 
@llvm.donothing() -; ADDR-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr [[P:%.*]], align 32 +; ADDR-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; ADDR-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] +; ADDR: 2: +; ADDR-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; ADDR-NEXT: unreachable +; ADDR: 3: +; ADDR-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr [[P:%.*]], align 32 +; ADDR-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64 +; ADDR-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; ADDR-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; ADDR-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP7]], align 32 ; ADDR-NEXT: ret void ; ; ORIGINS-LABEL: @load.v8i32( ; ORIGINS-NEXT: call void @llvm.donothing() ; ORIGINS-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr [[P:%.*]], align 32 +; ORIGINS-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64 +; ORIGINS-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080 +; ORIGINS-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +; ORIGINS-NEXT: [[TMP5:%.*]] = add i64 [[TMP3]], 17592186044416 +; ORIGINS-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; ORIGINS-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32 +; ORIGINS-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 32 ; ORIGINS-NEXT: ret void ; load <8 x i32>, ptr %p ret void } -define void @load.v16i32(ptr %p) sanitize_address { +define void @load.v16i32(ptr %p) sanitize_memory { ; CHECK-LABEL: @load.v16i32( ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr [[P:%.*]], align 64 +; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP4]], align 64 ; CHECK-NEXT: ret void ; ; ADDR-LABEL: @load.v16i32( +; ADDR-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; ADDR-NEXT: call void @llvm.donothing() -; ADDR-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr [[P:%.*]], align 64 +; ADDR-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; ADDR-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] +; ADDR: 2: +; ADDR-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; ADDR-NEXT: unreachable +; ADDR: 3: +; ADDR-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr [[P:%.*]], align 64 +; ADDR-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64 +; ADDR-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; ADDR-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; ADDR-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64 ; ADDR-NEXT: ret void ; ; ORIGINS-LABEL: @load.v16i32( ; ORIGINS-NEXT: call void @llvm.donothing() ; ORIGINS-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr [[P:%.*]], align 64 +; ORIGINS-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64 +; ORIGINS-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080 +; ORIGINS-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +; ORIGINS-NEXT: [[TMP5:%.*]] = add i64 [[TMP3]], 17592186044416 +; ORIGINS-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; ORIGINS-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP4]], align 64 +; ORIGINS-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 64 ; ORIGINS-NEXT: ret void ; load <16 x i32>, ptr %p @@ -107,7 +217,7 @@ define void @load.v16i32(ptr %p) sanitize_address { } -define void @store.v1i32(ptr %p) sanitize_address { +define void @store.v1i32(ptr %p) sanitize_memory { ; CHECK-LABEL: @store.v1i32( ; CHECK-NEXT: call void @llvm.donothing() ; 
CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 @@ -118,11 +228,18 @@ define void @store.v1i32(ptr %p) sanitize_address { ; CHECK-NEXT: ret void ; ; ADDR-LABEL: @store.v1i32( +; ADDR-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; ADDR-NEXT: call void @llvm.donothing() -; ADDR-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 -; ADDR-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 -; ADDR-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr -; ADDR-NEXT: store <1 x i32> zeroinitializer, ptr [[TMP3]], align 4 +; ADDR-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; ADDR-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] +; ADDR: 2: +; ADDR-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; ADDR-NEXT: unreachable +; ADDR: 3: +; ADDR-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; ADDR-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 87960930222080 +; ADDR-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; ADDR-NEXT: store <1 x i32> zeroinitializer, ptr [[TMP6]], align 4 ; ADDR-NEXT: store <1 x i32> zeroinitializer, ptr [[P]], align 4 ; ADDR-NEXT: ret void ; @@ -141,7 +258,7 @@ define void @store.v1i32(ptr %p) sanitize_address { ret void } -define void @store.v2i32(ptr %p) sanitize_address { +define void @store.v2i32(ptr %p) sanitize_memory { ; CHECK-LABEL: @store.v2i32( ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 @@ -152,11 +269,18 @@ define void @store.v2i32(ptr %p) sanitize_address { ; CHECK-NEXT: ret void ; ; ADDR-LABEL: @store.v2i32( +; ADDR-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; ADDR-NEXT: call void @llvm.donothing() -; ADDR-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 -; ADDR-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 -; ADDR-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr -; ADDR-NEXT: store <2 x i32> zeroinitializer, ptr [[TMP3]], align 8 +; ADDR-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; ADDR-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] +; ADDR: 2: +; ADDR-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; ADDR-NEXT: unreachable +; ADDR: 3: +; ADDR-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; ADDR-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 87960930222080 +; ADDR-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; ADDR-NEXT: store <2 x i32> zeroinitializer, ptr [[TMP6]], align 8 ; ADDR-NEXT: store <2 x i32> zeroinitializer, ptr [[P]], align 8 ; ADDR-NEXT: ret void ; @@ -175,7 +299,7 @@ define void @store.v2i32(ptr %p) sanitize_address { ret void } -define void @store.v4i32(ptr %p) sanitize_address { +define void @store.v4i32(ptr %p) sanitize_memory { ; CHECK-LABEL: @store.v4i32( ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 @@ -186,11 +310,18 @@ define void @store.v4i32(ptr %p) sanitize_address { ; CHECK-NEXT: ret void ; ; ADDR-LABEL: @store.v4i32( +; ADDR-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; ADDR-NEXT: call void @llvm.donothing() -; ADDR-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 -; ADDR-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 -; ADDR-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr -; ADDR-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP3]], align 16 +; ADDR-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; ADDR-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] +; ADDR: 2: +; ADDR-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] 
+; ADDR-NEXT: unreachable +; ADDR: 3: +; ADDR-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; ADDR-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 87960930222080 +; ADDR-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; ADDR-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP6]], align 16 ; ADDR-NEXT: store <4 x i32> zeroinitializer, ptr [[P]], align 16 ; ADDR-NEXT: ret void ; @@ -209,7 +340,7 @@ define void @store.v4i32(ptr %p) sanitize_address { ret void } -define void @store.v8i32(ptr %p) sanitize_address { +define void @store.v8i32(ptr %p) sanitize_memory { ; CHECK-LABEL: @store.v8i32( ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 @@ -220,11 +351,18 @@ define void @store.v8i32(ptr %p) sanitize_address { ; CHECK-NEXT: ret void ; ; ADDR-LABEL: @store.v8i32( +; ADDR-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; ADDR-NEXT: call void @llvm.donothing() -; ADDR-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 -; ADDR-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 -; ADDR-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr -; ADDR-NEXT: store <8 x i32> zeroinitializer, ptr [[TMP3]], align 32 +; ADDR-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; ADDR-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] +; ADDR: 2: +; ADDR-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; ADDR-NEXT: unreachable +; ADDR: 3: +; ADDR-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; ADDR-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 87960930222080 +; ADDR-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; ADDR-NEXT: store <8 x i32> zeroinitializer, ptr [[TMP6]], align 32 ; ADDR-NEXT: store <8 x i32> zeroinitializer, ptr [[P]], align 32 ; ADDR-NEXT: ret void ; @@ -243,7 +381,7 @@ define void @store.v8i32(ptr %p) sanitize_address { ret void } -define void @store.v16i32(ptr %p) sanitize_address { +define void @store.v16i32(ptr %p) sanitize_memory { ; CHECK-LABEL: @store.v16i32( ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 @@ -254,11 +392,18 @@ define void @store.v16i32(ptr %p) sanitize_address { ; CHECK-NEXT: ret void ; ; ADDR-LABEL: @store.v16i32( +; ADDR-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; ADDR-NEXT: call void @llvm.donothing() -; ADDR-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 -; ADDR-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 -; ADDR-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr -; ADDR-NEXT: store <16 x i32> zeroinitializer, ptr [[TMP3]], align 64 +; ADDR-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; ADDR-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] +; ADDR: 2: +; ADDR-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; ADDR-NEXT: unreachable +; ADDR: 3: +; ADDR-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; ADDR-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 87960930222080 +; ADDR-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; ADDR-NEXT: store <16 x i32> zeroinitializer, ptr [[TMP6]], align 64 ; ADDR-NEXT: store <16 x i32> zeroinitializer, ptr [[P]], align 64 ; ADDR-NEXT: ret void ; From e73186796db97633332434da69c4e9057e460a59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= Date: Wed, 22 Mar 2023 22:11:26 +0100 Subject: [PATCH 041/208] [JITLink] Deterministic JITDylib symbol table dumps Sort symbols before dumping so we get a deterministic order and can check them in tests. 
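The change itself is mechanical: JITDylib::dump previously iterated the symbol table map directly, so the printed order depended on the map's hashing. Copying the entries into a vector and sorting them by symbol name before printing makes the output stable. A simplified sketch of the pattern, using hypothetical names (std::unordered_map and std::string stand in for the Orc symbol table types):

    // Copy the entries, sort by key, then print, so the dump is
    // deterministic and can be matched with FileCheck.
    #include <algorithm>
    #include <iostream>
    #include <string>
    #include <unordered_map>
    #include <utility>
    #include <vector>

    void dumpSymbolTable(const std::unordered_map<std::string, int> &Symbols) {
      std::vector<std::pair<std::string, const int *>> Sorted;
      Sorted.reserve(Symbols.size());
      for (const auto &KV : Symbols)
        Sorted.emplace_back(KV.first, &KV.second);
      std::sort(Sorted.begin(), Sorted.end(),
                [](const auto &L, const auto &R) { return L.first < R.first; });
      for (const auto &KV : Sorted)
        std::cout << "  \"" << KV.first << "\": " << *KV.second << "\n";
    }

Sorting a small copied index (name plus pointer to the entry) keeps the dump cheap and leaves the underlying table untouched.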
Reviewed By: lhames Differential Revision: https://reviews.llvm.org/D146658 --- llvm/lib/ExecutionEngine/Orc/Core.cpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/llvm/lib/ExecutionEngine/Orc/Core.cpp b/llvm/lib/ExecutionEngine/Orc/Core.cpp index 82fa4bed914e6..9b6712818363e 100644 --- a/llvm/lib/ExecutionEngine/Orc/Core.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Core.cpp @@ -1438,16 +1438,23 @@ void JITDylib::dump(raw_ostream &OS) { OS << "Link order: " << LinkOrder << "\n" << "Symbol table:\n"; - for (auto &KV : Symbols) { + // Sort symbols so we get a deterministic order and can check them in tests. + std::vector> SymbolsSorted; + for (auto &KV : Symbols) + SymbolsSorted.emplace_back(KV.first, &KV.second); + std::sort(SymbolsSorted.begin(), SymbolsSorted.end(), + [](const auto &L, const auto &R) { return *L.first < *R.first; }); + + for (auto &KV : SymbolsSorted) { OS << " \"" << *KV.first << "\": "; - if (auto Addr = KV.second.getAddress()) + if (auto Addr = KV.second->getAddress()) OS << Addr; else OS << " "; - OS << " " << KV.second.getFlags() << " " << KV.second.getState(); + OS << " " << KV.second->getFlags() << " " << KV.second->getState(); - if (KV.second.hasMaterializerAttached()) { + if (KV.second->hasMaterializerAttached()) { OS << " (Materializer "; auto I = UnmaterializedInfos.find(KV.first); assert(I != UnmaterializedInfos.end() && From d59a43fe2ad81f5c3918c9ef79a986955256f7ea Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Wed, 22 Mar 2023 17:39:27 +0100 Subject: [PATCH 042/208] [libc++] Qualifies intptr_t and uintptr_t. This has been done using the following command find libcxx/test -type f -exec perl -pi -e 's|^([^/]+?)((?, std::atomic_char32_t>::value), ""); // Added by LWG 2441 - static_assert((std::is_same, std::atomic_intptr_t>::value), ""); - static_assert((std::is_same, std::atomic_uintptr_t>::value), ""); + static_assert((std::is_same, std::atomic_intptr_t>::value), ""); + static_assert((std::is_same, std::atomic_uintptr_t>::value), ""); static_assert((std::is_same, std::atomic_int8_t>::value), ""); static_assert((std::is_same, std::atomic_uint8_t>::value), ""); diff --git a/libcxx/test/std/atomics/stdatomic.h.syn/types.compile.pass.cpp b/libcxx/test/std/atomics/stdatomic.h.syn/types.compile.pass.cpp index 3121e7c57bafe..28125888f27df 100644 --- a/libcxx/test/std/atomics/stdatomic.h.syn/types.compile.pass.cpp +++ b/libcxx/test/std/atomics/stdatomic.h.syn/types.compile.pass.cpp @@ -200,8 +200,8 @@ void f() { static_assert(std::is_same_v, ::atomic_int_fast64_t>); static_assert(std::is_same_v, ::atomic_uint_fast64_t>); - static_assert(std::is_same_v, ::atomic_intptr_t>); - static_assert(std::is_same_v, ::atomic_uintptr_t>); + static_assert(std::is_same_v, ::atomic_intptr_t>); + static_assert(std::is_same_v, ::atomic_uintptr_t>); static_assert(std::is_same_v, ::atomic_size_t>); static_assert(std::is_same_v, ::atomic_ptrdiff_t>); static_assert(std::is_same_v, ::atomic_intmax_t>); diff --git a/libcxx/test/std/atomics/types.pass.cpp b/libcxx/test/std/atomics/types.pass.cpp index 63ab0f30c4a75..cb8dde0d513db 100644 --- a/libcxx/test/std/atomics/types.pass.cpp +++ b/libcxx/test/std/atomics/types.pass.cpp @@ -146,8 +146,8 @@ int main(int, char**) test< std::int64_t> (); test (); - test (); - test (); + test (); + test (); test (); test (); test (); diff --git a/libcxx/test/std/experimental/simd/simd.cons/generator.pass.cpp b/libcxx/test/std/experimental/simd/simd.cons/generator.pass.cpp index b28e1af29a89e..05a12895e9213 100644 --- 
a/libcxx/test/std/experimental/simd/simd.cons/generator.pass.cpp +++ b/libcxx/test/std/experimental/simd/simd.cons/generator.pass.cpp @@ -47,7 +47,7 @@ struct identity { void compile_generator() { supported_simd128_ctor(identity()); not_supported_simd128_ctor([](int i) { return float(i); }); - not_supported_simd128_ctor([](intptr_t i) { return (int*)(i); }); + not_supported_simd128_ctor([](std::intptr_t i) { return (int*)(i); }); not_supported_simd128_ctor([](int* i) { return i; }); } diff --git a/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.fail.cpp b/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.fail.cpp index 1d4699c18d8ea..89e2b6bf397b0 100644 --- a/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.fail.cpp +++ b/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.fail.cpp @@ -45,7 +45,7 @@ int main(int, char**) static_assert(toobig(), ""); // expected-error-re {{{{(static_assert|static assertion)}} expression is not an integral constant expression}} static_assert(toobig(), ""); // expected-error-re {{{{(static_assert|static assertion)}} expression is not an integral constant expression}} static_assert(toobig(), ""); // expected-error-re {{{{(static_assert|static assertion)}} expression is not an integral constant expression}} - static_assert(toobig(), ""); // expected-error-re {{{{(static_assert|static assertion)}} expression is not an integral constant expression}} + static_assert(toobig(), ""); // expected-error-re {{{{(static_assert|static assertion)}} expression is not an integral constant expression}} return 0; } diff --git a/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.pass.cpp b/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.pass.cpp index cbaf5dfbec19c..a4d70dc8c52c9 100644 --- a/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.pass.cpp +++ b/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.pass.cpp @@ -89,7 +89,7 @@ int main(int, char**) static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); - static_assert(!std::is_invocable_v); + static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); @@ -120,7 +120,7 @@ int main(int, char**) static_assert(test()); static_assert(test()); static_assert(test()); - static_assert(test()); + static_assert(test()); static_assert(test()); test(); @@ -136,7 +136,7 @@ int main(int, char**) test(); test(); test(); - test(); + test(); test(); return 0; diff --git a/libcxx/test/std/numerics/bit/bit.pow.two/bit_floor.pass.cpp b/libcxx/test/std/numerics/bit/bit.pow.two/bit_floor.pass.cpp index 473238be5e92c..0af4cddb074d1 100644 --- a/libcxx/test/std/numerics/bit/bit.pow.two/bit_floor.pass.cpp +++ b/libcxx/test/std/numerics/bit/bit.pow.two/bit_floor.pass.cpp @@ -86,7 +86,7 @@ int main(int, char**) static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); - static_assert(!std::is_invocable_v); + static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); @@ -118,7 +118,7 @@ int main(int, char**) static_assert(test()); static_assert(test()); static_assert(test()); - static_assert(test()); + static_assert(test()); static_assert(test()); test(); @@ -135,7 +135,7 @@ int main(int, char**) test(); test(); test(); - test(); + test(); test(); return 0; diff --git a/libcxx/test/std/numerics/bit/bit.pow.two/bit_width.pass.cpp b/libcxx/test/std/numerics/bit/bit.pow.two/bit_width.pass.cpp index 8110048e13960..747b4e02bac5c 100644 --- 
a/libcxx/test/std/numerics/bit/bit.pow.two/bit_width.pass.cpp +++ b/libcxx/test/std/numerics/bit/bit.pow.two/bit_width.pass.cpp @@ -91,7 +91,7 @@ int main(int, char**) static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); - static_assert(!std::is_invocable_v); + static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); @@ -122,7 +122,7 @@ int main(int, char**) static_assert(test()); static_assert(test()); static_assert(test()); - static_assert(test()); + static_assert(test()); static_assert(test()); test(); @@ -138,7 +138,7 @@ int main(int, char**) test(); test(); test(); - test(); + test(); test(); return 0; diff --git a/libcxx/test/std/numerics/bit/bit.pow.two/has_single_bit.pass.cpp b/libcxx/test/std/numerics/bit/bit.pow.two/has_single_bit.pass.cpp index 1c30f5cec5191..398fee8cafc1d 100644 --- a/libcxx/test/std/numerics/bit/bit.pow.two/has_single_bit.pass.cpp +++ b/libcxx/test/std/numerics/bit/bit.pow.two/has_single_bit.pass.cpp @@ -89,7 +89,7 @@ int main(int, char**) static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); - static_assert(!std::is_invocable_v); + static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); @@ -120,7 +120,7 @@ int main(int, char**) static_assert(test()); static_assert(test()); static_assert(test()); - static_assert(test()); + static_assert(test()); static_assert(test()); test(); @@ -136,7 +136,7 @@ int main(int, char**) test(); test(); test(); - test(); + test(); test(); return 0; diff --git a/libcxx/test/std/numerics/bit/bitops.count/countl_one.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/countl_one.pass.cpp index b236e37ee8791..7b8f6da809b26 100644 --- a/libcxx/test/std/numerics/bit/bitops.count/countl_one.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.count/countl_one.pass.cpp @@ -86,7 +86,7 @@ int main(int, char**) static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); - static_assert(!std::is_invocable_v); + static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); @@ -117,7 +117,7 @@ int main(int, char**) static_assert(test()); static_assert(test()); static_assert(test()); - static_assert(test()); + static_assert(test()); static_assert(test()); test(); @@ -133,7 +133,7 @@ int main(int, char**) test(); test(); test(); - test(); + test(); test(); return 0; diff --git a/libcxx/test/std/numerics/bit/bitops.count/countl_zero.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/countl_zero.pass.cpp index 58c953f0b97a8..4b0f8156d621b 100644 --- a/libcxx/test/std/numerics/bit/bitops.count/countl_zero.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.count/countl_zero.pass.cpp @@ -85,7 +85,7 @@ int main(int, char**) static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); - static_assert(!std::is_invocable_v); + static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); @@ -116,7 +116,7 @@ int main(int, char**) static_assert(test()); static_assert(test()); static_assert(test()); - static_assert(test()); + static_assert(test()); static_assert(test()); test(); @@ -132,7 +132,7 @@ int main(int, char**) test(); test(); test(); - test(); + test(); test(); return 0; diff --git 
a/libcxx/test/std/numerics/bit/bitops.count/countr_one.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/countr_one.pass.cpp index 208e694e0282a..b88a770745d5d 100644 --- a/libcxx/test/std/numerics/bit/bitops.count/countr_one.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.count/countr_one.pass.cpp @@ -90,7 +90,7 @@ int main(int, char**) static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); - static_assert(!std::is_invocable_v); + static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); @@ -121,7 +121,7 @@ int main(int, char**) static_assert(test()); static_assert(test()); static_assert(test()); - static_assert(test()); + static_assert(test()); static_assert(test()); test(); @@ -137,7 +137,7 @@ int main(int, char**) test(); test(); test(); - test(); + test(); test(); return 0; diff --git a/libcxx/test/std/numerics/bit/bitops.count/countr_zero.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/countr_zero.pass.cpp index 0d14d9e71044b..2f2f81d961ad9 100644 --- a/libcxx/test/std/numerics/bit/bitops.count/countr_zero.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.count/countr_zero.pass.cpp @@ -87,7 +87,7 @@ int main(int, char**) static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); - static_assert(!std::is_invocable_v); + static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); @@ -118,7 +118,7 @@ int main(int, char**) static_assert(test()); static_assert(test()); static_assert(test()); - static_assert(test()); + static_assert(test()); static_assert(test()); test(); @@ -134,7 +134,7 @@ int main(int, char**) test(); test(); test(); - test(); + test(); test(); return 0; diff --git a/libcxx/test/std/numerics/bit/bitops.count/popcount.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/popcount.pass.cpp index 383338a2f21bd..605b84ed35158 100644 --- a/libcxx/test/std/numerics/bit/bitops.count/popcount.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.count/popcount.pass.cpp @@ -97,7 +97,7 @@ int main(int, char**) static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); - static_assert(!std::is_invocable_v); + static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); @@ -128,7 +128,7 @@ int main(int, char**) static_assert(test()); static_assert(test()); static_assert(test()); - static_assert(test()); + static_assert(test()); static_assert(test()); test(); @@ -144,7 +144,7 @@ int main(int, char**) test(); test(); test(); - test(); + test(); test(); return 0; diff --git a/libcxx/test/std/numerics/bit/bitops.rot/rotl.pass.cpp b/libcxx/test/std/numerics/bit/bitops.rot/rotl.pass.cpp index 6cc1410eb682f..2b56ae15682e0 100644 --- a/libcxx/test/std/numerics/bit/bitops.rot/rotl.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.rot/rotl.pass.cpp @@ -85,7 +85,7 @@ int main(int, char**) static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); - static_assert(!std::is_invocable_v); + static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); @@ -117,7 +117,7 @@ int main(int, char**) static_assert(test()); static_assert(test()); static_assert(test()); - static_assert(test()); + static_assert(test()); static_assert(test()); test(); @@ -134,7 +134,7 @@ int main(int, char**) 
test(); test(); test(); - test(); + test(); test(); return 0; diff --git a/libcxx/test/std/numerics/bit/bitops.rot/rotr.pass.cpp b/libcxx/test/std/numerics/bit/bitops.rot/rotr.pass.cpp index b218bb0397335..fee122fe607b5 100644 --- a/libcxx/test/std/numerics/bit/bitops.rot/rotr.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.rot/rotr.pass.cpp @@ -86,7 +86,7 @@ int main(int, char**) static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); - static_assert(!std::is_invocable_v); + static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); static_assert(!std::is_invocable_v); @@ -118,7 +118,7 @@ int main(int, char**) static_assert(test()); static_assert(test()); static_assert(test()); - static_assert(test()); + static_assert(test()); static_assert(test()); test(); @@ -135,7 +135,7 @@ int main(int, char**) test(); test(); test(); - test(); + test(); test(); return 0; diff --git a/libcxx/test/std/ranges/range.adaptors/range.elements/iterator/ctor.other.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.elements/iterator/ctor.other.pass.cpp index 6756474024821..4c94cd5edf62e 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.elements/iterator/ctor.other.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.elements/iterator/ctor.other.pass.cpp @@ -21,7 +21,7 @@ template struct ConvertibleIter : IterBase> { using iterator_category = std::random_access_iterator_tag; using value_type = std::tuple; - using difference_type = intptr_t; + using difference_type = std::intptr_t; bool movedFromOtherConst = false; int i = 0; diff --git a/libcxx/test/std/ranges/range.adaptors/range.elements/sentinel/equality.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.elements/sentinel/equality.pass.cpp index 55477cc997587..df95e07c97d97 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.elements/sentinel/equality.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.elements/sentinel/equality.pass.cpp @@ -23,7 +23,7 @@ struct Iter { std::tuple* it_; using value_type = std::tuple; - using difference_type = intptr_t; + using difference_type = std::intptr_t; using iterator_concept = std::input_iterator_tag; constexpr decltype(auto) operator*() const { return *it_; } diff --git a/libcxx/test/std/ranges/range.adaptors/range.elements/types.h b/libcxx/test/std/ranges/range.adaptors/range.elements/types.h index a1c0884b60719..f1ee165c3cc63 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.elements/types.h +++ b/libcxx/test/std/ranges/range.adaptors/range.elements/types.h @@ -93,7 +93,7 @@ template struct IterBase { using iterator_concept = std::random_access_iterator_tag; using value_type = std::tuple; - using difference_type = intptr_t; + using difference_type = std::intptr_t; constexpr std::tuple operator*() const { return std::tuple(5); } diff --git a/libcxx/test/std/ranges/range.adaptors/range.split/types.h b/libcxx/test/std/ranges/range.adaptors/range.split/types.h index ff2ce38317cd9..aa47faf0f468b 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.split/types.h +++ b/libcxx/test/std/ranges/range.adaptors/range.split/types.h @@ -20,7 +20,7 @@ template struct ForwardIterBase { using iterator_concept = std::forward_iterator_tag; using value_type = int; - using difference_type = intptr_t; + using difference_type = std::intptr_t; constexpr int operator*() const { return 5; } diff --git a/libcxx/test/std/ranges/range.adaptors/range.take.while/sentinel/equality.pass.cpp 
b/libcxx/test/std/ranges/range.adaptors/range.take.while/sentinel/equality.pass.cpp index 3d5b835c01c27..db3e5764421af 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.take.while/sentinel/equality.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.take.while/sentinel/equality.pass.cpp @@ -26,7 +26,7 @@ struct Iter { int* it_; using value_type = int; - using difference_type = intptr_t; + using difference_type = std::intptr_t; using iterator_concept = std::input_iterator_tag; constexpr decltype(auto) operator*() const { return *it_; } diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/arithmetic.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/arithmetic.pass.cpp index ce7858f1a8921..efe64b31f79fb 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/arithmetic.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/arithmetic.pass.cpp @@ -63,7 +63,7 @@ constexpr bool test() { assert(&y1 == &(b[3])); using Iter = decltype(it1); - static_assert(canPlusEqual); + static_assert(canPlusEqual); } { @@ -83,7 +83,7 @@ constexpr bool test() { assert(&y1 == &(b[2])); using Iter = decltype(it1); - static_assert(canMinusEqual); + static_assert(canMinusEqual); } { @@ -116,12 +116,12 @@ constexpr bool test() { // One of the ranges is not random access std::ranges::zip_view v(a, b, ForwardSizedView{buffer1}); using Iter = decltype(v.begin()); - static_assert(!std::invocable, Iter, intptr_t>); - static_assert(!std::invocable, intptr_t, Iter>); - static_assert(!canPlusEqual); - static_assert(!std::invocable, Iter, intptr_t>); + static_assert(!std::invocable, Iter, std::intptr_t>); + static_assert(!std::invocable, std::intptr_t, Iter>); + static_assert(!canPlusEqual); + static_assert(!std::invocable, Iter, std::intptr_t>); static_assert(std::invocable, Iter, Iter>); - static_assert(!canMinusEqual); + static_assert(!canMinusEqual); } { diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/compare.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/compare.pass.cpp index 19b5b99993495..fcbff722c39b3 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/compare.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/compare.pass.cpp @@ -41,7 +41,7 @@ struct LessThanIterator { using iterator_category = std::random_access_iterator_tag; using value_type = int; - using difference_type = intptr_t; + using difference_type = std::intptr_t; constexpr int& operator*() const { return *it_; } constexpr int& operator[](difference_type n) const { return it_[n]; } diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/ctor.default.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/ctor.default.pass.cpp index 8c038abdb24f5..98078b2ce3095 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/ctor.default.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/ctor.default.pass.cpp @@ -20,7 +20,7 @@ struct PODIter { using iterator_category = std::random_access_iterator_tag; using value_type = int; - using difference_type = intptr_t; + using difference_type = std::intptr_t; constexpr int operator*() const { return i; } diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/member_types.compile.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/member_types.compile.pass.cpp index 6b0c086d8c4fc..c19f6c2b16524 100644 --- 
a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/member_types.compile.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/member_types.compile.pass.cpp @@ -144,16 +144,16 @@ void test() { { // difference_type of single view - std::ranges::zip_view v{DiffTypeRange{}}; + std::ranges::zip_view v{DiffTypeRange{}}; using Iter = decltype(v.begin()); - static_assert(std::is_same_v); + static_assert(std::is_same_v); } { // difference_type of multiple views should be the common type - std::ranges::zip_view v{DiffTypeRange{}, DiffTypeRange{}}; + std::ranges::zip_view v{DiffTypeRange{}, DiffTypeRange{}}; using Iter = decltype(v.begin()); - static_assert(std::is_same_v>); + static_assert(std::is_same_v>); } const std::array foos{Foo{}}; diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/sentinel/minus.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.zip/sentinel/minus.pass.cpp index 7f8b513a97717..c4c85bc24e1e8 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.zip/sentinel/minus.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.zip/sentinel/minus.pass.cpp @@ -34,7 +34,7 @@ struct convertible_forward_sized_iterator { using iterator_category = std::forward_iterator_tag; using value_type = int; - using difference_type = intptr_t; + using difference_type = std::intptr_t; convertible_forward_sized_iterator() = default; constexpr convertible_forward_sized_iterator(Base it) : it_(it) {} diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/types.h b/libcxx/test/std/ranges/range.adaptors/range.zip/types.h index 299ffeac0489a..fa82b836f529b 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.zip/types.h +++ b/libcxx/test/std/ranges/range.adaptors/range.zip/types.h @@ -201,7 +201,7 @@ struct forward_sized_iterator { using iterator_category = std::forward_iterator_tag; using value_type = int; - using difference_type = intptr_t; + using difference_type = std::intptr_t; using pointer = Base; using reference = decltype(*Base{}); @@ -405,7 +405,7 @@ struct iter_move_swap_iterator { using iterator_category = std::input_iterator_tag; using value_type = int; - using difference_type = intptr_t; + using difference_type = std::intptr_t; constexpr int operator*() const { return i; } diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.pointer.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.pointer.pass.cpp index 37d50f3d17017..83a3df3d1e447 100644 --- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.pointer.pass.cpp +++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.pointer.pass.cpp @@ -59,7 +59,7 @@ void test(StringT expected, StringViewT fmt, PointerT arg) { buffer[0] = CharT('0'); buffer[1] = CharT('x'); expected.append(buffer.begin(), - std::to_chars(buffer.begin() + 2, buffer.end(), reinterpret_cast(arg), 16).ptr); + std::to_chars(buffer.begin() + 2, buffer.end(), reinterpret_cast(arg), 16).ptr); } assert(result == expected); } diff --git a/libcxx/test/std/utilities/function.objects/unord.hash/integral.pass.cpp b/libcxx/test/std/utilities/function.objects/unord.hash/integral.pass.cpp index c645ad8f476f1..124eb843d298e 100644 --- a/libcxx/test/std/utilities/function.objects/unord.hash/integral.pass.cpp +++ b/libcxx/test/std/utilities/function.objects/unord.hash/integral.pass.cpp @@ -90,7 +90,7 @@ int main(int, char**) test(); test(); - test(); + test(); test(); test(); @@ -108,7 +108,7 @@ 
int main(int, char**) test(); test(); - test(); + test(); #ifndef TEST_HAS_NO_INT128 test<__int128_t>(); diff --git a/libcxx/test/std/utilities/memory/temporary.buffer/overaligned.pass.cpp b/libcxx/test/std/utilities/memory/temporary.buffer/overaligned.pass.cpp index a4f534e4b212c..3e1006ad984cd 100644 --- a/libcxx/test/std/utilities/memory/temporary.buffer/overaligned.pass.cpp +++ b/libcxx/test/std/utilities/memory/temporary.buffer/overaligned.pass.cpp @@ -40,7 +40,7 @@ int main(int, char**) { std::pair ip = std::get_temporary_buffer(5); assert(!(ip.first == nullptr) ^ (ip.second == 0)); - assert(reinterpret_cast(ip.first) % alignof(A) == 0); + assert(reinterpret_cast(ip.first) % alignof(A) == 0); std::return_temporary_buffer(ip.first); return 0; diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/allocate_shared_construct.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/allocate_shared_construct.pass.cpp index e354d4a2721d5..4281cc1aa9e0d 100644 --- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/allocate_shared_construct.pass.cpp +++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/allocate_shared_construct.pass.cpp @@ -123,7 +123,7 @@ struct Bar { }; void test_aligned(void* p, std::size_t align) { - assert(reinterpret_cast(p) % align == 0); + assert(reinterpret_cast(p) % align == 0); } int main(int, char**) { diff --git a/libcxx/test/std/utilities/meta/meta.unary.prop.query/alignment_of.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary.prop.query/alignment_of.pass.cpp index a1c24b745b441..66318951a8c97 100644 --- a/libcxx/test/std/utilities/meta/meta.unary.prop.query/alignment_of.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary.prop.query/alignment_of.pass.cpp @@ -43,8 +43,8 @@ int main(int, char**) { test_alignment_of(); test_alignment_of(); - test_alignment_of(); - test_alignment_of(); + test_alignment_of(); + test_alignment_of(); test_alignment_of(); test_alignment_of(); test_alignment_of(); From 71a5958406fb8d13ed3692db7696b68f59752053 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Wed, 22 Mar 2023 22:20:10 +0100 Subject: [PATCH 043/208] [libc++] Remove __mutex_base header This header should have been removed in https://reviews.llvm.org/D146228, but there was a merge conflict. --- libcxx/include/__mutex_base | 521 ------------------------------------ 1 file changed, 521 deletions(-) delete mode 100644 libcxx/include/__mutex_base diff --git a/libcxx/include/__mutex_base b/libcxx/include/__mutex_base deleted file mode 100644 index 191955363a2d3..0000000000000 --- a/libcxx/include/__mutex_base +++ /dev/null @@ -1,521 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP___MUTEX_BASE -#define _LIBCPP___MUTEX_BASE - -#include <__chrono/duration.h> -#include <__chrono/steady_clock.h> -#include <__chrono/system_clock.h> -#include <__chrono/time_point.h> -#include <__config> -#include <__threading_support> -#include <__type_traits/is_floating_point.h> -#include <__type_traits/is_nothrow_default_constructible.h> -#include -#include -#include - -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif - -_LIBCPP_PUSH_MACROS -#include <__undef_macros> - - -_LIBCPP_BEGIN_NAMESPACE_STD - -#ifndef _LIBCPP_HAS_NO_THREADS - -class _LIBCPP_TYPE_VIS _LIBCPP_THREAD_SAFETY_ANNOTATION(capability("mutex")) mutex -{ - __libcpp_mutex_t __m_ = _LIBCPP_MUTEX_INITIALIZER; - -public: - _LIBCPP_INLINE_VISIBILITY - _LIBCPP_CONSTEXPR mutex() = default; - - mutex(const mutex&) = delete; - mutex& operator=(const mutex&) = delete; - -#if defined(_LIBCPP_HAS_TRIVIAL_MUTEX_DESTRUCTION) - ~mutex() = default; -#else - ~mutex() _NOEXCEPT; -#endif - - void lock() _LIBCPP_THREAD_SAFETY_ANNOTATION(acquire_capability()); - bool try_lock() _NOEXCEPT _LIBCPP_THREAD_SAFETY_ANNOTATION(try_acquire_capability(true)); - void unlock() _NOEXCEPT _LIBCPP_THREAD_SAFETY_ANNOTATION(release_capability()); - - typedef __libcpp_mutex_t* native_handle_type; - _LIBCPP_INLINE_VISIBILITY native_handle_type native_handle() {return &__m_;} -}; - -static_assert(is_nothrow_default_constructible::value, - "the default constructor for std::mutex must be nothrow"); - -struct _LIBCPP_TYPE_VIS defer_lock_t { explicit defer_lock_t() = default; }; -struct _LIBCPP_TYPE_VIS try_to_lock_t { explicit try_to_lock_t() = default; }; -struct _LIBCPP_TYPE_VIS adopt_lock_t { explicit adopt_lock_t() = default; }; - -# if defined(_LIBCPP_BUILDING_LIBRARY) -extern _LIBCPP_EXPORTED_FROM_ABI const defer_lock_t defer_lock; -extern _LIBCPP_EXPORTED_FROM_ABI const try_to_lock_t try_to_lock; -extern _LIBCPP_EXPORTED_FROM_ABI const adopt_lock_t adopt_lock; -# elif !defined(_LIBCPP_CXX03_LANG) -/* inline */ constexpr defer_lock_t defer_lock = defer_lock_t(); -/* inline */ constexpr try_to_lock_t try_to_lock = try_to_lock_t(); -/* inline */ constexpr adopt_lock_t adopt_lock = adopt_lock_t(); -# endif - -template -class _LIBCPP_TEMPLATE_VIS _LIBCPP_THREAD_SAFETY_ANNOTATION(scoped_lockable) -lock_guard -{ -public: - typedef _Mutex mutex_type; - -private: - mutex_type& __m_; -public: - - _LIBCPP_NODISCARD_EXT _LIBCPP_INLINE_VISIBILITY - explicit lock_guard(mutex_type& __m) _LIBCPP_THREAD_SAFETY_ANNOTATION(acquire_capability(__m)) - : __m_(__m) {__m_.lock();} - - _LIBCPP_NODISCARD_EXT _LIBCPP_INLINE_VISIBILITY - lock_guard(mutex_type& __m, adopt_lock_t) _LIBCPP_THREAD_SAFETY_ANNOTATION(requires_capability(__m)) - : __m_(__m) {} - _LIBCPP_INLINE_VISIBILITY - ~lock_guard() _LIBCPP_THREAD_SAFETY_ANNOTATION(release_capability()) {__m_.unlock();} - -private: - lock_guard(lock_guard const&) = delete; - lock_guard& operator=(lock_guard const&) = delete; -}; -_LIBCPP_CTAD_SUPPORTED_FOR_TYPE(lock_guard); - -template -class _LIBCPP_TEMPLATE_VIS unique_lock -{ -public: - typedef _Mutex mutex_type; - -private: - mutex_type* __m_; - bool __owns_; - -public: - _LIBCPP_INLINE_VISIBILITY - unique_lock() _NOEXCEPT : __m_(nullptr), __owns_(false) {} - _LIBCPP_INLINE_VISIBILITY - explicit unique_lock(mutex_type& __m) - : __m_(_VSTD::addressof(__m)), __owns_(true) {__m_->lock();} 
- _LIBCPP_INLINE_VISIBILITY - unique_lock(mutex_type& __m, defer_lock_t) _NOEXCEPT - : __m_(_VSTD::addressof(__m)), __owns_(false) {} - _LIBCPP_INLINE_VISIBILITY - unique_lock(mutex_type& __m, try_to_lock_t) - : __m_(_VSTD::addressof(__m)), __owns_(__m.try_lock()) {} - _LIBCPP_INLINE_VISIBILITY - unique_lock(mutex_type& __m, adopt_lock_t) - : __m_(_VSTD::addressof(__m)), __owns_(true) {} - template - _LIBCPP_INLINE_VISIBILITY - unique_lock(mutex_type& __m, const chrono::time_point<_Clock, _Duration>& __t) - : __m_(_VSTD::addressof(__m)), __owns_(__m.try_lock_until(__t)) {} - template - _LIBCPP_INLINE_VISIBILITY - unique_lock(mutex_type& __m, const chrono::duration<_Rep, _Period>& __d) - : __m_(_VSTD::addressof(__m)), __owns_(__m.try_lock_for(__d)) {} - _LIBCPP_INLINE_VISIBILITY - ~unique_lock() - { - if (__owns_) - __m_->unlock(); - } - - unique_lock(unique_lock const&) = delete; - unique_lock& operator=(unique_lock const&) = delete; - - _LIBCPP_INLINE_VISIBILITY - unique_lock(unique_lock&& __u) _NOEXCEPT - : __m_(__u.__m_), __owns_(__u.__owns_) - {__u.__m_ = nullptr; __u.__owns_ = false;} - _LIBCPP_INLINE_VISIBILITY - unique_lock& operator=(unique_lock&& __u) _NOEXCEPT - { - if (__owns_) - __m_->unlock(); - __m_ = __u.__m_; - __owns_ = __u.__owns_; - __u.__m_ = nullptr; - __u.__owns_ = false; - return *this; - } - - void lock(); - bool try_lock(); - - template - bool try_lock_for(const chrono::duration<_Rep, _Period>& __d); - template - bool try_lock_until(const chrono::time_point<_Clock, _Duration>& __t); - - void unlock(); - - _LIBCPP_INLINE_VISIBILITY - void swap(unique_lock& __u) _NOEXCEPT - { - _VSTD::swap(__m_, __u.__m_); - _VSTD::swap(__owns_, __u.__owns_); - } - _LIBCPP_INLINE_VISIBILITY - mutex_type* release() _NOEXCEPT - { - mutex_type* __m = __m_; - __m_ = nullptr; - __owns_ = false; - return __m; - } - - _LIBCPP_INLINE_VISIBILITY - bool owns_lock() const _NOEXCEPT {return __owns_;} - _LIBCPP_INLINE_VISIBILITY - explicit operator bool() const _NOEXCEPT {return __owns_;} - _LIBCPP_INLINE_VISIBILITY - mutex_type* mutex() const _NOEXCEPT {return __m_;} -}; -_LIBCPP_CTAD_SUPPORTED_FOR_TYPE(unique_lock); - -template -void -unique_lock<_Mutex>::lock() -{ - if (__m_ == nullptr) - __throw_system_error(EPERM, "unique_lock::lock: references null mutex"); - if (__owns_) - __throw_system_error(EDEADLK, "unique_lock::lock: already locked"); - __m_->lock(); - __owns_ = true; -} - -template -bool -unique_lock<_Mutex>::try_lock() -{ - if (__m_ == nullptr) - __throw_system_error(EPERM, "unique_lock::try_lock: references null mutex"); - if (__owns_) - __throw_system_error(EDEADLK, "unique_lock::try_lock: already locked"); - __owns_ = __m_->try_lock(); - return __owns_; -} - -template -template -bool -unique_lock<_Mutex>::try_lock_for(const chrono::duration<_Rep, _Period>& __d) -{ - if (__m_ == nullptr) - __throw_system_error(EPERM, "unique_lock::try_lock_for: references null mutex"); - if (__owns_) - __throw_system_error(EDEADLK, "unique_lock::try_lock_for: already locked"); - __owns_ = __m_->try_lock_for(__d); - return __owns_; -} - -template -template -bool -unique_lock<_Mutex>::try_lock_until(const chrono::time_point<_Clock, _Duration>& __t) -{ - if (__m_ == nullptr) - __throw_system_error(EPERM, "unique_lock::try_lock_until: references null mutex"); - if (__owns_) - __throw_system_error(EDEADLK, "unique_lock::try_lock_until: already locked"); - __owns_ = __m_->try_lock_until(__t); - return __owns_; -} - -template -void -unique_lock<_Mutex>::unlock() -{ - if (!__owns_) - 
__throw_system_error(EPERM, "unique_lock::unlock: not locked"); - __m_->unlock(); - __owns_ = false; -} - -template -inline _LIBCPP_INLINE_VISIBILITY -void -swap(unique_lock<_Mutex>& __x, unique_lock<_Mutex>& __y) _NOEXCEPT - {__x.swap(__y);} - -//enum class cv_status -_LIBCPP_DECLARE_STRONG_ENUM(cv_status) -{ - no_timeout, - timeout -}; -_LIBCPP_DECLARE_STRONG_ENUM_EPILOG(cv_status) - -class _LIBCPP_TYPE_VIS condition_variable -{ - __libcpp_condvar_t __cv_ = _LIBCPP_CONDVAR_INITIALIZER; -public: - _LIBCPP_INLINE_VISIBILITY - _LIBCPP_CONSTEXPR condition_variable() _NOEXCEPT = default; - -#ifdef _LIBCPP_HAS_TRIVIAL_CONDVAR_DESTRUCTION - ~condition_variable() = default; -#else - ~condition_variable(); -#endif - - condition_variable(const condition_variable&) = delete; - condition_variable& operator=(const condition_variable&) = delete; - - void notify_one() _NOEXCEPT; - void notify_all() _NOEXCEPT; - - void wait(unique_lock& __lk) _NOEXCEPT; - template - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS - void wait(unique_lock& __lk, _Predicate __pred); - - template - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS - cv_status - wait_until(unique_lock& __lk, - const chrono::time_point<_Clock, _Duration>& __t); - - template - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS - bool - wait_until(unique_lock& __lk, - const chrono::time_point<_Clock, _Duration>& __t, - _Predicate __pred); - - template - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS - cv_status - wait_for(unique_lock& __lk, - const chrono::duration<_Rep, _Period>& __d); - - template - bool - _LIBCPP_INLINE_VISIBILITY - wait_for(unique_lock& __lk, - const chrono::duration<_Rep, _Period>& __d, - _Predicate __pred); - - typedef __libcpp_condvar_t* native_handle_type; - _LIBCPP_INLINE_VISIBILITY native_handle_type native_handle() {return &__cv_;} - -private: - void __do_timed_wait(unique_lock& __lk, - chrono::time_point) _NOEXCEPT; -#if defined(_LIBCPP_HAS_COND_CLOCKWAIT) - void __do_timed_wait(unique_lock& __lk, - chrono::time_point) _NOEXCEPT; -#endif - template - void __do_timed_wait(unique_lock& __lk, - chrono::time_point<_Clock, chrono::nanoseconds>) _NOEXCEPT; -}; -#endif // !_LIBCPP_HAS_NO_THREADS - -template -inline _LIBCPP_INLINE_VISIBILITY -__enable_if_t::value, chrono::nanoseconds> -__safe_nanosecond_cast(chrono::duration<_Rep, _Period> __d) -{ - using namespace chrono; - using __ratio = ratio_divide<_Period, nano>; - using __ns_rep = nanoseconds::rep; - _Rep __result_float = __d.count() * __ratio::num / __ratio::den; - - _Rep __result_max = numeric_limits<__ns_rep>::max(); - if (__result_float >= __result_max) { - return nanoseconds::max(); - } - - _Rep __result_min = numeric_limits<__ns_rep>::min(); - if (__result_float <= __result_min) { - return nanoseconds::min(); - } - - return nanoseconds(static_cast<__ns_rep>(__result_float)); -} - -template -inline _LIBCPP_INLINE_VISIBILITY -__enable_if_t::value, chrono::nanoseconds> -__safe_nanosecond_cast(chrono::duration<_Rep, _Period> __d) -{ - using namespace chrono; - if (__d.count() == 0) { - return nanoseconds(0); - } - - using __ratio = ratio_divide<_Period, nano>; - using __ns_rep = nanoseconds::rep; - __ns_rep __result_max = numeric_limits<__ns_rep>::max(); - if (__d.count() > 0 && __d.count() > __result_max / __ratio::num) { - return nanoseconds::max(); - } - - __ns_rep __result_min = numeric_limits<__ns_rep>::min(); - if (__d.count() < 0 && __d.count() < __result_min / __ratio::num) { - return nanoseconds::min(); - } - - __ns_rep __result = __d.count() * 
__ratio::num / __ratio::den; - if (__result == 0) { - return nanoseconds(1); - } - - return nanoseconds(__result); -} - -#ifndef _LIBCPP_HAS_NO_THREADS -template -void -condition_variable::wait(unique_lock& __lk, _Predicate __pred) -{ - while (!__pred()) - wait(__lk); -} - -template -cv_status -condition_variable::wait_until(unique_lock& __lk, - const chrono::time_point<_Clock, _Duration>& __t) -{ - using namespace chrono; - using __clock_tp_ns = time_point<_Clock, nanoseconds>; - - typename _Clock::time_point __now = _Clock::now(); - if (__t <= __now) - return cv_status::timeout; - - __clock_tp_ns __t_ns = __clock_tp_ns(_VSTD::__safe_nanosecond_cast(__t.time_since_epoch())); - - __do_timed_wait(__lk, __t_ns); - return _Clock::now() < __t ? cv_status::no_timeout : cv_status::timeout; -} - -template -bool -condition_variable::wait_until(unique_lock& __lk, - const chrono::time_point<_Clock, _Duration>& __t, - _Predicate __pred) -{ - while (!__pred()) - { - if (wait_until(__lk, __t) == cv_status::timeout) - return __pred(); - } - return true; -} - -template -cv_status -condition_variable::wait_for(unique_lock& __lk, - const chrono::duration<_Rep, _Period>& __d) -{ - using namespace chrono; - if (__d <= __d.zero()) - return cv_status::timeout; - using __ns_rep = nanoseconds::rep; - steady_clock::time_point __c_now = steady_clock::now(); - -#if defined(_LIBCPP_HAS_COND_CLOCKWAIT) - using __clock_tp_ns = time_point; - __ns_rep __now_count_ns = _VSTD::__safe_nanosecond_cast(__c_now.time_since_epoch()).count(); -#else - using __clock_tp_ns = time_point; - __ns_rep __now_count_ns = _VSTD::__safe_nanosecond_cast(system_clock::now().time_since_epoch()).count(); -#endif - - __ns_rep __d_ns_count = _VSTD::__safe_nanosecond_cast(__d).count(); - - if (__now_count_ns > numeric_limits<__ns_rep>::max() - __d_ns_count) { - __do_timed_wait(__lk, __clock_tp_ns::max()); - } else { - __do_timed_wait(__lk, __clock_tp_ns(nanoseconds(__now_count_ns + __d_ns_count))); - } - - return steady_clock::now() - __c_now < __d ? 
cv_status::no_timeout : - cv_status::timeout; -} - -template -inline -bool -condition_variable::wait_for(unique_lock& __lk, - const chrono::duration<_Rep, _Period>& __d, - _Predicate __pred) -{ - return wait_until(__lk, chrono::steady_clock::now() + __d, - _VSTD::move(__pred)); -} - -#if defined(_LIBCPP_HAS_COND_CLOCKWAIT) -inline -void -condition_variable::__do_timed_wait(unique_lock& __lk, - chrono::time_point __tp) _NOEXCEPT -{ - using namespace chrono; - if (!__lk.owns_lock()) - __throw_system_error(EPERM, - "condition_variable::timed wait: mutex not locked"); - nanoseconds __d = __tp.time_since_epoch(); - timespec __ts; - seconds __s = duration_cast(__d); - using __ts_sec = decltype(__ts.tv_sec); - const __ts_sec __ts_sec_max = numeric_limits<__ts_sec>::max(); - if (__s.count() < __ts_sec_max) - { - __ts.tv_sec = static_cast<__ts_sec>(__s.count()); - __ts.tv_nsec = (__d - __s).count(); - } - else - { - __ts.tv_sec = __ts_sec_max; - __ts.tv_nsec = giga::num - 1; - } - int __ec = pthread_cond_clockwait(&__cv_, __lk.mutex()->native_handle(), CLOCK_MONOTONIC, &__ts); - if (__ec != 0 && __ec != ETIMEDOUT) - __throw_system_error(__ec, "condition_variable timed_wait failed"); -} -#endif // _LIBCPP_HAS_COND_CLOCKWAIT - -template -inline -void -condition_variable::__do_timed_wait(unique_lock& __lk, - chrono::time_point<_Clock, chrono::nanoseconds> __tp) _NOEXCEPT -{ - wait_for(__lk, __tp - _Clock::now()); -} - -#endif // !_LIBCPP_HAS_NO_THREADS - -_LIBCPP_END_NAMESPACE_STD - -_LIBCPP_POP_MACROS - -#endif // _LIBCPP___MUTEX_BASE From 4d18d97b594ccaa3cbd79beb4afef45e4156dc8d Mon Sep 17 00:00:00 2001 From: Dave Lee Date: Thu, 9 Mar 2023 11:10:32 -0800 Subject: [PATCH 044/208] [lldb] Fix dwim-print error message for missing expr --- lldb/source/Commands/CommandObjectDWIMPrint.cpp | 10 ++++++---- lldb/test/API/commands/dwim-print/TestDWIMPrint.py | 7 +++++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/lldb/source/Commands/CommandObjectDWIMPrint.cpp b/lldb/source/Commands/CommandObjectDWIMPrint.cpp index 419a27acc8181..ed816195350e9 100644 --- a/lldb/source/Commands/CommandObjectDWIMPrint.cpp +++ b/lldb/source/Commands/CommandObjectDWIMPrint.cpp @@ -61,14 +61,16 @@ bool CommandObjectDWIMPrint::DoExecute(StringRef command, OptionsWithRaw args{command}; StringRef expr = args.GetRawPart(); + if (expr.empty()) { + result.AppendErrorWithFormatv("'{0}' takes a variable or expression", + m_cmd_name); + return false; + } + if (args.HasArgs()) { if (!ParseOptionsAndNotify(args.GetArgs(), result, m_option_group, m_exe_ctx)) return false; - } else if (command.empty()) { - result.AppendErrorWithFormatv("'{0}' takes a variable or expression", - m_cmd_name); - return false; } // If the user has not specified, default to disabling persistent results. 
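A minimal illustrative sketch of the control flow after this fix, assuming nothing beyond what the hunk above shows (the doExecuteSketch name and the simplified signature are hypothetical, not lldb's): the raw expression is now checked for emptiness before any option parsing, so both `dwim-print` and `dwim-print -- ` report the same error, which the test change below exercises.

#include <cstdio>
#include <string>

// rawExpr is the text after any "--" separator; hasArgs is whether options
// were supplied before it.
bool doExecuteSketch(const std::string &rawExpr, bool hasArgs) {
  if (rawExpr.empty()) {
    // Same message whether or not options were given.
    std::fprintf(stderr, "error: 'dwim-print' takes a variable or expression\n");
    return false;
  }
  if (hasArgs) {
    // Option parsing would run here (ParseOptionsAndNotify in lldb).
  }
  // Otherwise evaluate rawExpr, trying a frame variable before a full expression.
  return true;
}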
diff --git a/lldb/test/API/commands/dwim-print/TestDWIMPrint.py b/lldb/test/API/commands/dwim-print/TestDWIMPrint.py index 22d18f91d0a59..9f69895f43692 100644 --- a/lldb/test/API/commands/dwim-print/TestDWIMPrint.py +++ b/lldb/test/API/commands/dwim-print/TestDWIMPrint.py @@ -107,3 +107,10 @@ def test_expression_language(self): lldbutil.run_to_name_breakpoint(self, "main") self._expect_cmd(f"dwim-print -l c++ -- argc", "frame variable") self._expect_cmd(f"dwim-print -l c++ -- argc + 1", "expression") + + def test_empty_expression(self): + self.build() + lldbutil.run_to_name_breakpoint(self, "main") + error_msg = "error: 'dwim-print' takes a variable or expression" + self.expect(f"dwim-print", error=True, startstr=error_msg) + self.expect(f"dwim-print -- ", error=True, startstr=error_msg) From fb8d894f23c5e805f0c87d89fb9d6c0eed3a0e72 Mon Sep 17 00:00:00 2001 From: Chia-hung Duan Date: Wed, 22 Mar 2023 21:46:23 +0000 Subject: [PATCH 045/208] [scudo] Early exit from the case can't do page release. There are heuristics to avoid marking blocks if there's little chance to release pages. So far, those logics only exist in block-marking section and we didn't leverage the results of those logics. For example, in a round of releaseToOS try, we know it's still 128 KB away from the release threshold. In the next round of releaseToOS, we can early exit if the number of pushed bytes is smaller than 128 KB without looping each memory group. This CL adds this heuristic and has reduced amount of time in checking the status of each memory group. This CL only applies this heuristic on SizeClassAllocator64. SizeClassAllocator32 has a smaller region/group size and has little impact on the default value. Reviewed By: cferris Differential Revision: https://reviews.llvm.org/D146312 --- compiler-rt/lib/scudo/standalone/primary64.h | 120 ++++++++++++++----- 1 file changed, 87 insertions(+), 33 deletions(-) diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h index 1cb6d02f6cd6a..bca5ab82f3d59 100644 --- a/compiler-rt/lib/scudo/standalone/primary64.h +++ b/compiler-rt/lib/scudo/standalone/primary64.h @@ -64,32 +64,8 @@ template class SizeClassAllocator64 { void init(s32 ReleaseToOsInterval) NO_THREAD_SAFETY_ANALYSIS { DCHECK(isAligned(reinterpret_cast(this), alignof(ThisT))); - DCHECK_EQ(PrimaryBase, 0U); - - // Reserve the space required for the Primary. - PrimaryBase = reinterpret_cast(map( - nullptr, PrimarySize, "scudo:primary_reserve", MAP_NOACCESS, &Data)); - u32 Seed; - const u64 Time = getMonotonicTimeFast(); - if (!getRandom(reinterpret_cast(&Seed), sizeof(Seed))) - Seed = static_cast(Time ^ (PrimaryBase >> 12)); const uptr PageSize = getPageSizeCached(); - for (uptr I = 0; I < NumClasses; I++) { - RegionInfo *Region = getRegionInfo(I); - // The actual start of a region is offset by a random number of pages - // when PrimaryEnableRandomOffset is set. - Region->RegionBeg = (PrimaryBase + (I << Config::PrimaryRegionSizeLog)) + - (Config::PrimaryEnableRandomOffset - ? 
((getRandomModN(&Seed, 16) + 1) * PageSize) - : 0); - Region->RandState = getRandomU32(&Seed); - Region->ReleaseInfo.LastReleaseAtNs = Time; - } - shuffle(RegionInfoArray, NumClasses, &Seed); - - setOption(Option::ReleaseInterval, static_cast(ReleaseToOsInterval)); - const uptr GroupSize = (1U << GroupSizeLog); const uptr PagesInGroup = GroupSize / PageSize; const uptr MinSizeClass = getSizeByClassId(1); @@ -126,6 +102,37 @@ template class SizeClassAllocator64 { // use its size of in-use blocks as a heuristic. SmallerBlockReleasePageDelta = PagesInGroup * (1 + MinSizeClass / 16U) / 100; + + DCHECK_EQ(PrimaryBase, 0U); + // Reserve the space required for the Primary. + PrimaryBase = reinterpret_cast(map( + nullptr, PrimarySize, "scudo:primary_reserve", MAP_NOACCESS, &Data)); + + u32 Seed; + const u64 Time = getMonotonicTimeFast(); + if (!getRandom(reinterpret_cast(&Seed), sizeof(Seed))) + Seed = static_cast(Time ^ (PrimaryBase >> 12)); + + for (uptr I = 0; I < NumClasses; I++) { + RegionInfo *Region = getRegionInfo(I); + // The actual start of a region is offset by a random number of pages + // when PrimaryEnableRandomOffset is set. + Region->RegionBeg = (PrimaryBase + (I << Config::PrimaryRegionSizeLog)) + + (Config::PrimaryEnableRandomOffset + ? ((getRandomModN(&Seed, 16) + 1) * PageSize) + : 0); + Region->RandState = getRandomU32(&Seed); + // Releasing small blocks is expensive, set a higher threshold to avoid + // frequent page releases. + if (isSmallBlock(getSizeByClassId(I))) + Region->TryReleaseThreshold = PageSize * SmallerBlockReleasePageDelta; + else + Region->TryReleaseThreshold = PageSize; + Region->ReleaseInfo.LastReleaseAtNs = Time; + } + shuffle(RegionInfoArray, NumClasses, &Seed); + + setOption(Option::ReleaseInterval, static_cast(ReleaseToOsInterval)); } void unmapTestOnly() NO_THREAD_SAFETY_ANALYSIS { @@ -440,6 +447,8 @@ template class SizeClassAllocator64 { uptr MappedUser GUARDED_BY(Mutex) = 0; // Bytes allocated for user memory. uptr AllocatedUser GUARDED_BY(Mutex) = 0; + // The minimum size of pushed blocks to trigger page release. + uptr TryReleaseThreshold GUARDED_BY(Mutex) = 0; MapPlatformData Data GUARDED_BY(Mutex) = {}; ReleaseToOsInfo ReleaseInfo GUARDED_BY(Mutex) = {}; bool Exhausted GUARDED_BY(Mutex) = false; @@ -486,6 +495,11 @@ template class SizeClassAllocator64 { return Base + (CompactPtrGroupBase << CompactPtrScale); } + ALWAYS_INLINE static bool isSmallBlock(uptr BlockSize) { + const uptr PageSize = getPageSizeCached(); + return BlockSize < PageSize / 16U; + } + // Push the blocks to their batch group. The layout will be like, // // FreeList - > BG -> BG -> BG @@ -823,14 +837,15 @@ template class SizeClassAllocator64 { return 0; // Nothing new to release. const bool CheckDensity = - BlockSize < PageSize / 16U && ReleaseType != ReleaseToOS::ForceAll; + isSmallBlock(BlockSize) && ReleaseType != ReleaseToOS::ForceAll; // Releasing smaller blocks is expensive, so we want to make sure that a // significant amount of bytes are free, and that there has been a good // amount of batches pushed to the freelist before attempting to release. if (CheckDensity) { if (ReleaseType == ReleaseToOS::Normal && - BytesPushed < Region->AllocatedUser / 16U) + BytesPushed < Region->TryReleaseThreshold) { return 0; + } } if (ReleaseType == ReleaseToOS::Normal) { @@ -865,11 +880,18 @@ template class SizeClassAllocator64 { // of groups. 
uptr NumberOfBatchGroups = Region->FreeList.size(); + // We are examining each group and will take the minimum distance to the + // release threshold as the next Region::TryReleaseThreshold(). Note that if + // the size of free blocks has reached the release threshold, the distance + // to the next release will be PageSize * SmallerBlockReleasePageDelta. See + // the comment on `SmallerBlockReleasePageDelta` for more details. + uptr MinDistToThreshold = GroupSize; + for (BatchGroup *BG = Region->FreeList.front(), *Prev = nullptr; BG != nullptr;) { const uptr PushedBytesDelta = - BG->PushedBlocks - BG->PushedBlocksAtLastCheckpoint; - if (PushedBytesDelta * BlockSize < PageSize) { + (BG->PushedBlocks - BG->PushedBlocksAtLastCheckpoint) * BlockSize; + if (PushedBytesDelta < PageSize) { Prev = BG; BG = BG->Next; continue; @@ -913,16 +935,38 @@ template class SizeClassAllocator64 { // that this heuristic only applies when all the spaces in a BatchGroup // are allocated. if (CheckDensity) { - const bool HighDensity = (BytesInBG * 100U) / AllocatedGroupSize >= - (100U - 1U - BlockSize / 16U); + const uptr ReleaseThreshold = + (AllocatedGroupSize * (100 - 1U - BlockSize / 16U)) / 100U; + const bool HighDensity = BytesInBG >= ReleaseThreshold; const bool MayHaveReleasedAll = NumBlocks >= (GroupSize / BlockSize); // If all blocks in the group are released, we will do range marking // which is fast. Otherwise, we will wait until we have accumulated // a certain amount of free memory. const bool ReachReleaseDelta = - MayHaveReleasedAll ? true - : PushedBytesDelta * BlockSize >= - PageSize * SmallerBlockReleasePageDelta; + MayHaveReleasedAll + ? true + : PushedBytesDelta >= PageSize * SmallerBlockReleasePageDelta; + + if (!HighDensity) { + DCHECK_LE(BytesInBG, ReleaseThreshold); + // The following is the usage of a memroy group, + // + // BytesInBG ReleaseThreshold + // / \ v + // +---+---------------------------+-----+ + // | | | | | + // +---+---------------------------+-----+ + // \ / ^ + // PushedBytesDelta GroupEnd + MinDistToThreshold = + Min(MinDistToThreshold, + ReleaseThreshold - BytesInBG + PushedBytesDelta); + } else { + // If it reaches high density at this round, the next time we will try + // to release is based on SmallerBlockReleasePageDelta + MinDistToThreshold = + Min(MinDistToThreshold, PageSize * SmallerBlockReleasePageDelta); + } if (!HighDensity || !ReachReleaseDelta) { Prev = BG; @@ -976,6 +1020,16 @@ template class SizeClassAllocator64 { GroupToRelease.push_back(Cur); } + // Only small blocks have the adaptive `TryReleaseThreshold`. + if (isSmallBlock(BlockSize)) { + // If the MinDistToThreshold is not updated, that means each memory group + // may have only pushed less than a page size. In that case, just set it + // back to normal. + if (MinDistToThreshold == GroupSize) + MinDistToThreshold = PageSize * SmallerBlockReleasePageDelta; + Region->TryReleaseThreshold = MinDistToThreshold; + } + if (GroupToRelease.empty()) return 0; From fe27495be2040007c7b20844a9371b06156ab405 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Thu, 29 Dec 2022 12:11:38 -0800 Subject: [PATCH 046/208] [MemProf] Context disambiguation cloning pass [patch 1b/3] Adds support for building the graph in ThinLTO from MemProf summaries. Follow-on patches will contain the support for cloning on the graph and in the IR. Depends on D140908. 
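A minimal sketch of how the new summary-index entry point is driven, assuming the LLVM headers are available; the runMemProfOnIndex wrapper and the PrevailingModuleForGUID parameter are illustrative stand-ins for the thin-link state kept in LTO.cpp, while the MemProfContextDisambiguation::run(Index, isPrevailing) call and the shape of the isPrevailing callback are the ones this patch adds (see the LTO.cpp hunk below).

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"

using namespace llvm;

// Build the callsite context graph over the combined summary index, using only
// the prevailing copy of each symbol so we never clone a weak copy that will
// not be linked.
void runMemProfOnIndex(
    ModuleSummaryIndex &Index,
    const DenseMap<GlobalValue::GUID, StringRef> &PrevailingModuleForGUID) {
  auto isPrevailing = [&](GlobalValue::GUID GUID, const GlobalValueSummary *S) {
    return PrevailingModuleForGUID.lookup(GUID) == S->modulePath();
  };
  MemProfContextDisambiguation ContextDisambiguation;
  ContextDisambiguation.run(Index, isPrevailing);
}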
Differential Revision: https://reviews.llvm.org/D145836 --- llvm/include/llvm/IR/ModuleSummaryIndex.h | 10 + .../IPO/MemProfContextDisambiguation.h | 7 + llvm/lib/LTO/LTO.cpp | 16 +- .../IPO/MemProfContextDisambiguation.cpp | 221 +++++++++- llvm/test/ThinLTO/X86/memprof-basic.ll | 157 +++++++ .../X86/memprof-duplicate-context-ids.ll | 229 ++++++++++ .../X86/memprof-duplicate-context-ids2.ll | 390 ++++++++++++++++++ llvm/test/ThinLTO/X86/memprof-indirectcall.ll | 266 ++++++++++++ llvm/test/ThinLTO/X86/memprof-inlined.ll | 186 +++++++++ llvm/test/ThinLTO/X86/memprof-inlined2.ll | 124 ++++++ 10 files changed, 1599 insertions(+), 7 deletions(-) create mode 100644 llvm/test/ThinLTO/X86/memprof-basic.ll create mode 100644 llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll create mode 100644 llvm/test/ThinLTO/X86/memprof-duplicate-context-ids2.ll create mode 100644 llvm/test/ThinLTO/X86/memprof-indirectcall.ll create mode 100644 llvm/test/ThinLTO/X86/memprof-inlined.ll create mode 100644 llvm/test/ThinLTO/X86/memprof-inlined2.ll diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h index 18853102799b4..0c178ccef3bbb 100644 --- a/llvm/include/llvm/IR/ModuleSummaryIndex.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h @@ -988,12 +988,22 @@ class FunctionSummary : public GlobalValueSummary { return {}; } + CallsitesTy &mutableCallsites() { + assert(Callsites); + return *Callsites; + } + ArrayRef allocs() const { if (Allocs) return *Allocs; return {}; } + AllocsTy &mutableAllocs() { + assert(Allocs); + return *Allocs; + } + friend struct GraphTraits; }; diff --git a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h index 56e56ed67f7df..475ea48cca932 100644 --- a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h +++ b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h @@ -19,9 +19,12 @@ #include "llvm/ADT/StringSet.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/PassManager.h" +#include namespace llvm { +class GlobalValueSummary; class Module; +class ModuleSummaryIndex; class MemProfContextDisambiguation : public PassInfoMixin { @@ -32,6 +35,10 @@ class MemProfContextDisambiguation MemProfContextDisambiguation() {} PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + + void run(ModuleSummaryIndex &Index, + function_ref + isPrevailing); }; } // end namespace llvm diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 1f273a8e5025f..ee6b8c3aa234d 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -51,6 +51,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h" #include "llvm/Transforms/IPO/WholeProgramDevirt.h" #include "llvm/Transforms/Utils/FunctionImportUtils.h" #include "llvm/Transforms/Utils/SplitModule.h" @@ -75,6 +76,9 @@ cl::opt EnableLTOInternalization( cl::desc("Enable global value internalization in LTO")); } +/// Enable MemProf context disambiguation for thin link. +extern cl::opt EnableMemProfContextDisambiguation; + // Computes a unique hash for the Module considering the current list of // export/import and other global analysis results. // The hash is produced in \p Key. 
@@ -1539,6 +1543,14 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, runWholeProgramDevirtOnIndex(ThinLTO.CombinedIndex, ExportedGUIDs, LocalWPDTargetsMap); + auto isPrevailing = [&](GlobalValue::GUID GUID, const GlobalValueSummary *S) { + return ThinLTO.PrevailingModuleForGUID[GUID] == S->modulePath(); + }; + if (EnableMemProfContextDisambiguation) { + MemProfContextDisambiguation ContextDisambiguation; + ContextDisambiguation.run(ThinLTO.CombinedIndex, isPrevailing); + } + if (Conf.OptLevel > 0) ComputeCrossModuleImport(ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, ImportLists, ExportLists); @@ -1580,10 +1592,6 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, updateIndexWPDForExports(ThinLTO.CombinedIndex, isExported, LocalWPDTargetsMap); - auto isPrevailing = [&](GlobalValue::GUID GUID, - const GlobalValueSummary *S) { - return ThinLTO.PrevailingModuleForGUID[GUID] == S->modulePath(); - }; thinLTOInternalizeAndPromoteInIndex(ThinLTO.CombinedIndex, isExported, isPrevailing); diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 5a6625743eecf..b2fcea1ec8694 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -14,9 +14,9 @@ // subsequently annotated with an attribute for later transformation. // // The transformations can be performed either directly on IR (regular LTO), or -// (eventually) on a ThinLTO index (later applied to the IR during the ThinLTO -// backend). Both types of LTO operate on a the same base graph representation, -// which uses CRTP to support either IR or Index formats. +// on a ThinLTO index (and later applied to the IR during the ThinLTO backend). +// Both types of LTO operate on a the same base graph representation, which +// uses CRTP to support either IR or Index formats. // //===----------------------------------------------------------------------===// @@ -28,9 +28,11 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/MemoryProfileInfo.h" +#include "llvm/Analysis/ModuleSummaryAnalysis.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" +#include "llvm/IR/ModuleSummaryIndex.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileSystem.h" @@ -458,6 +460,56 @@ class ModuleCallsiteContextGraph const Module &Mod; }; +/// Represents a call in the summary index graph, which can either be an +/// allocation or an interior callsite node in an allocation's context. +/// Holds a pointer to the corresponding data structure in the index. +struct IndexCall : public PointerUnion { + IndexCall() : PointerUnion() {} + IndexCall(std::nullptr_t) : IndexCall() {} + IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {} + IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {} + + IndexCall *operator->() { return this; } + + void print(raw_ostream &OS) const { + if (auto *AI = dyn_cast()) + OS << *AI; + else { + auto *CI = dyn_cast(); + assert(CI); + OS << *CI; + } + } +}; + +/// CRTP derived class for graphs built from summary index (ThinLTO). 
+class IndexCallsiteContextGraph + : public CallsiteContextGraph { +public: + IndexCallsiteContextGraph( + ModuleSummaryIndex &Index, + function_ref + isPrevailing); + +private: + friend CallsiteContextGraph; + + uint64_t getStackId(uint64_t IdOrIndex) const; + bool calleeMatchesFunc(IndexCall &Call, const FunctionSummary *Func); + uint64_t getLastStackId(IndexCall &Call); + std::vector getStackIdsWithContextNodesForCall(IndexCall &Call); + std::string getLabel(const FunctionSummary *Func, const IndexCall &Call, + unsigned CloneNo) const; + + // Saves mapping from function summaries containing memprof records back to + // its VI, for use in checking and debugging. + std::map FSToVIMap; + + const ModuleSummaryIndex &Index; +}; + namespace { struct FieldSeparator { @@ -475,6 +527,20 @@ raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) { return OS << FS.Sep; } +// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc +// type we should actually use on the corresponding allocation. +// If we can't clone a node that has NotCold+Cold alloc type, we will fall +// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold +// from NotCold. +AllocationType allocTypeToUse(uint8_t AllocTypes) { + assert(AllocTypes != (uint8_t)AllocationType::None); + if (AllocTypes == + ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold)) + return AllocationType::NotCold; + else + return (AllocationType)AllocTypes; +} + } // end anonymous namespace template @@ -1118,6 +1184,20 @@ uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) { return CallsiteContext.back(); } +uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) { + assert(Call.is()); + CallStack::const_iterator> + CallsiteContext(Call.dyn_cast()); + // Need to convert index into stack id. + return Index.getStackIdAtIndex(CallsiteContext.back()); +} + +static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) { + if (!CloneNo) + return Base.str(); + return (Base + ".memprof." 
+ Twine(CloneNo)).str(); +} + std::string ModuleCallsiteContextGraph::getLabel(const Function *Func, const Instruction *Call, unsigned CloneNo) const { @@ -1126,6 +1206,22 @@ std::string ModuleCallsiteContextGraph::getLabel(const Function *Func, .str(); } +std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func, + const IndexCall &Call, + unsigned CloneNo) const { + auto VI = FSToVIMap.find(Func); + assert(VI != FSToVIMap.end()); + if (Call.is()) + return (VI->second.name() + " -> alloc").str(); + else { + auto *Callsite = Call.dyn_cast(); + return (VI->second.name() + " -> " + + getMemProfFuncName(Callsite->Callee.name(), + Callsite->Clones[CloneNo])) + .str(); + } +} + std::vector ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall( Instruction *Call) { @@ -1135,6 +1231,16 @@ ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall( CallsiteContext); } +std::vector +IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) { + assert(Call.is()); + CallStack::const_iterator> + CallsiteContext(Call.dyn_cast()); + return getStackIdsWithContextNodes::const_iterator>( + CallsiteContext); +} + template template std::vector @@ -1207,6 +1313,84 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(Module &M) : Mod(M) { Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr); } +IndexCallsiteContextGraph::IndexCallsiteContextGraph( + ModuleSummaryIndex &Index, + function_ref + isPrevailing) + : Index(Index) { + for (auto &I : Index) { + auto VI = Index.getValueInfo(I); + for (auto &S : VI.getSummaryList()) { + // We should only add the prevailing nodes. Otherwise we may try to clone + // in a weak copy that won't be linked (and may be different than the + // prevailing version). + // We only keep the memprof summary on the prevailing copy now when + // building the combined index, as a space optimization, however don't + // rely on this optimization. The linker doesn't resolve local linkage + // values so don't check whether those are prevailing. + if (!GlobalValue::isLocalLinkage(S->linkage()) && + !isPrevailing(VI.getGUID(), S.get())) + continue; + auto *FS = dyn_cast(S.get()); + if (!FS) + continue; + std::vector CallsWithMetadata; + if (!FS->allocs().empty()) { + for (auto &AN : FS->mutableAllocs()) { + // This can happen because of recursion elimination handling that + // currently exists in ModuleSummaryAnalysis. Skip these for now. + // We still added them to the summary because we need to be able to + // correlate properly in applyImport in the backends. + if (AN.MIBs.empty()) + continue; + CallsWithMetadata.push_back({&AN}); + auto *AllocNode = addAllocNode({&AN}, FS); + // Pass an empty CallStack to the CallsiteContext (second) + // parameter, since for ThinLTO we already collapsed out the inlined + // stack ids on the allocation call during ModuleSummaryAnalysis. + CallStack::const_iterator> + EmptyContext; + // Now add all of the MIBs and their stack nodes. + for (auto &MIB : AN.MIBs) { + CallStack::const_iterator> + StackContext(&MIB); + addStackNodesForMIB::const_iterator>( + AllocNode, StackContext, EmptyContext, MIB.AllocType); + } + assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None); + // Initialize version 0 on the summary alloc node to the current alloc + // type, unless it has both types in which case make it default, so + // that in the case where we aren't able to clone the original version + // always ends up with the default allocation behavior. 
+ AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes); + } + } + // For callsite metadata, add to list for this function for later use. + if (!FS->callsites().empty()) + for (auto &SN : FS->mutableCallsites()) + CallsWithMetadata.push_back({&SN}); + + if (!CallsWithMetadata.empty()) + FuncToCallsWithMetadata.push_back({FS, CallsWithMetadata}); + + if (!FS->allocs().empty() || !FS->callsites().empty()) + FSToVIMap[FS] = VI; + } + } + + if (DumpCCG) { + dbgs() << "CCG before updating call stack chains:\n"; + dbgs() << *this; + } + + if (ExportToDot) + exportToDot("prestackupdate"); + + updateStackNodes(); + + handleCallsitesWithMultipleTargets(); +} + template void CallsiteContextGraph::handleCallsitesWithMultipleTargets() { @@ -1251,6 +1435,12 @@ uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const { return IdOrIndex; } +uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const { + // In the Index case this is an index into the stack id list in the summary + // index, convert it to an Id. + return Index.getStackIdAtIndex(IdOrIndex); +} + bool ModuleCallsiteContextGraph::calleeMatchesFunc(Instruction *Call, const Function *Func) { auto *CB = dyn_cast(Call); @@ -1264,6 +1454,23 @@ bool ModuleCallsiteContextGraph::calleeMatchesFunc(Instruction *Call, return Alias && Alias->getAliasee() == Func; } +bool IndexCallsiteContextGraph::calleeMatchesFunc(IndexCall &Call, + const FunctionSummary *Func) { + ValueInfo Callee = Call.dyn_cast()->Callee; + // If there is no summary list then this is a call to an externally defined + // symbol. + AliasSummary *Alias = + Callee.getSummaryList().empty() + ? nullptr + : dyn_cast(Callee.getSummaryList()[0].get()); + assert(FSToVIMap.count(Func)); + return Callee == FSToVIMap[Func] || + // If callee is an alias, check the aliasee, since only function + // summary base objects will contain the stack node summaries and thus + // get a context node. + (Alias && Alias->getAliaseeVI() == FSToVIMap[Func]); +} + static std::string getAllocTypeString(uint8_t AllocTypes) { if (!AllocTypes) return "None"; @@ -1581,3 +1788,11 @@ PreservedAnalyses MemProfContextDisambiguation::run(Module &M, return PreservedAnalyses::all(); return PreservedAnalyses::none(); } + +void MemProfContextDisambiguation::run( + ModuleSummaryIndex &Index, + function_ref + isPrevailing) { + IndexCallsiteContextGraph CCG(Index, isPrevailing); + CCG.process(); +} diff --git a/llvm/test/ThinLTO/X86/memprof-basic.ll b/llvm/test/ThinLTO/X86/memprof-basic.ll new file mode 100644 index 0000000000000..d8c78d270f277 --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-basic.ll @@ -0,0 +1,157 @@ +;; Test callsite context graph generation for simple call graph with +;; two memprof contexts and no inlining. +;; +;; Original code looks like: +;; +;; char *bar() { +;; return new char[10]; +;; } +;; +;; char *baz() { +;; return bar(); +;; } +;; +;; char *foo() { +;; return baz(); +;; } +;; +;; int main(int argc, char **argv) { +;; char *x = foo(); +;; char *y = foo(); +;; memset(x, 0, 10); +;; memset(y, 0, 10); +;; delete[] x; +;; sleep(10); +;; delete[] y; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; The IR was then reduced using llvm-reduce with the expected FileCheck input. 
+ +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ +; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP + +; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT + + +source_filename = "memprof-basic.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @main() { +entry: + %call = call ptr @_Z3foov(), !callsite !0 + %call1 = call ptr @_Z3foov(), !callsite !1 + ret i32 0 +} + +declare void @_ZdaPv() + +declare i32 @sleep() + +define internal ptr @_Z3barv() { +entry: + %call = call ptr @_Znam(i64 0), !memprof !2, !callsite !7 + ret ptr null +} + +declare ptr @_Znam(i64) + +define internal ptr @_Z3bazv() { +entry: + %call = call ptr @_Z3barv(), !callsite !8 + ret ptr null +} + +define internal ptr @_Z3foov() { +entry: + %call = call ptr @_Z3bazv(), !callsite !9 + ret ptr null +} + +; uselistorder directives +uselistorder ptr @_Z3foov, { 1, 0 } + +!0 = !{i64 8632435727821051414} +!1 = !{i64 -3421689549917153178} +!2 = !{!3, !5} +!3 = !{!4, !"notcold"} +!4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!5 = !{!6, !"cold"} +!6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} +!7 = !{i64 9086428284934609951} +!8 = !{i64 -5964873800580613432} +!9 = !{i64 2732490490862098848} + + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[BAR:0x[a-z0-9]+]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 2, 3, 0 +; DUMP: AllocType 2 StackIds: 2, 3, 1 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 + +; DUMP: Node [[BAZ]] +; DUMP: Callee: 9832687305761716512 (_Z3barv) Clones: 0 StackIds: 2 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 + +; DUMP: Node [[FOO]] +; DUMP: Callee: 5878270615442837395 (_Z3bazv) Clones: 0 StackIds: 3 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 + +; DUMP: Node [[MAIN1]] +; DUMP: Callee: 6731117468105397038 (_Z3foov) Clones: 0 StackIds: 0 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN2]] +; DUMP: Callee: 6731117468105397038 (_Z3foov) Clones: 0 StackIds: 1 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: 
[[MAIN2]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: + + +; DOT: digraph "postbuild" { +; DOT: label="postbuild"; +; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> alloc}"]; +; DOT: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 12481870273128938184\n_Z3bazv -\> _Z3barv}"]; +; DOT: Node[[BAZ]] -> Node[[BAR]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"]; +; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 2732490490862098848\n_Z3foov -\> _Z3bazv}"]; +; DOT: Node[[FOO]] -> Node[[BAZ]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"]; +; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN1]] -> Node[[FOO]][tooltip="ContextIds: 1",fillcolor="brown1"]; +; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN2]] -> Node[[FOO]][tooltip="ContextIds: 2",fillcolor="cyan"]; +; DOT: } diff --git a/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll new file mode 100644 index 0000000000000..772b319e0715e --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll @@ -0,0 +1,229 @@ +;; Test callsite context graph generation for call graph with with MIBs +;; that have pruned contexts that partially match multiple inlined +;; callsite contexts, requiring duplication of context ids and nodes +;; while matching callsite nodes onto the graph. +;; +;; Original code looks like: +;; +;; char *D() { +;; return new char[10]; +;; } +;; +;; char *F() { +;; return D(); +;; } +;; +;; char *C() { +;; return D(); +;; } +;; +;; char *B() { +;; return C(); +;; } +;; +;; char *E() { +;; return C(); +;; } +;; int main(int argc, char **argv) { +;; char *x = B(); // cold +;; char *y = E(); // cold +;; char *z = F(); // default +;; memset(x, 0, 10); +;; memset(y, 0, 10); +;; memset(z, 0, 10); +;; delete[] z; +;; sleep(10); +;; delete[] x; +;; delete[] y; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; The code below was created by forcing inlining of C into both B and E. +;; Since both allocation contexts via C are cold, the matched memprof +;; metadata has the context pruned above C's callsite. This requires +;; matching the stack node for C to callsites where it was inlined (i.e. +;; the callsites in B and E that have callsite metadata that includes C's). +;; It also requires duplication of that node in the graph as well as the +;; duplication of the context ids along that path through the graph, +;; so that we can represent the duplicated (via inlining) C callsite. +;; +;; The IR was then reduced using llvm-reduce with the expected FileCheck input. 
+ +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ +; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP + +; RUN: cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE +; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST + + +source_filename = "duplicate-context-ids.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define internal ptr @_Z1Dv() { +entry: + %call = call ptr @_Znam(i64 0), !memprof !0, !callsite !5 + ret ptr null +} + +declare ptr @_Znam(i64) + +define internal ptr @_Z1Fv() { +entry: + %call = call ptr @_Z1Dv(), !callsite !6 + ret ptr null +} + +define internal ptr @_Z1Cv() { +entry: + %call = call ptr @_Z1Dv(), !callsite !7 + ret ptr null +} + +define internal ptr @_Z1Bv() { +entry: + %call.i = call ptr @_Z1Dv(), !callsite !8 + ret ptr null +} + +define internal ptr @_Z1Ev() { +entry: + %call.i = call ptr @_Z1Dv(), !callsite !9 + ret ptr null +} + +declare i32 @main() + +declare void @_ZdaPv() + +declare i32 @sleep() + +!0 = !{!1, !3} +!1 = !{!2, !"cold"} +!2 = !{i64 6541423618768552252, i64 -6270142974039008131} +!3 = !{!4, !"notcold"} +!4 = !{i64 6541423618768552252, i64 -4903163940066524832} +!5 = !{i64 6541423618768552252} +!6 = !{i64 -4903163940066524832} +!7 = !{i64 -6270142974039008131} +!8 = !{i64 -6270142974039008131, i64 -184525619819294889} +!9 = !{i64 -6270142974039008131, i64 1905834578520680781} + + +;; After adding only the alloc node memprof metadata, we only have 2 contexts. + +; DUMP: CCG before updating call stack chains: +; DUMP: Callsite Context Graph: +; DUMP: Node [[D:0x[a-z0-9]+]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 2 StackIds: 0 +; DUMP: AllocType 1 StackIds: 1 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[C:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 1 +; DUMP: Edge from Callee [[D]] to Caller: [[F:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 2 + +; DUMP: Node [[C]] +; DUMP: null Call +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[C]] AllocTypes: Cold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[F]] +; DUMP: null Call +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[F]] AllocTypes: NotCold ContextIds: 2 +; DUMP: CallerEdges: + +;; After updating for callsite metadata, we should have generated context ids 3 and 4, +;; along with 2 new nodes for those callsites. All have the same allocation type +;; behavior as the original C node. 
+ +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[D]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 2 StackIds: 0 +; DUMP: AllocType 1 StackIds: 1 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 3 4 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[F]] AllocTypes: NotCold ContextIds: 2 +; DUMP: Edge from Callee [[D]] to Caller: [[C2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 3 +; DUMP: Edge from Callee [[D]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4 +; DUMP: Edge from Callee [[D]] to Caller: [[E:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 1 + +; DUMP: Node [[F]] +; DUMP: Callee: 4881081444663423788 (_Z1Dv) Clones: 0 StackIds: 1 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[F]] AllocTypes: NotCold ContextIds: 2 +; DUMP: CallerEdges: + +; DUMP: Node [[C2]] +; DUMP: Callee: 4881081444663423788 (_Z1Dv) Clones: 0 StackIds: 0 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 3 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[C2]] AllocTypes: Cold ContextIds: 3 +; DUMP: CallerEdges: + +; DUMP: Node [[B]] +; DUMP: Callee: 4881081444663423788 (_Z1Dv) Clones: 0 StackIds: 0, 2 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[B]] AllocTypes: Cold ContextIds: 4 +; DUMP: CallerEdges: + +; DUMP: Node [[E]] +; DUMP: Callee: 4881081444663423788 (_Z1Dv) Clones: 0 StackIds: 0, 3 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[E]] AllocTypes: Cold ContextIds: 1 +; DUMP: CallerEdges: + + +; DOTPRE: digraph "prestackupdate" { +; DOTPRE: label="prestackupdate"; +; DOTPRE: Node[[D:0x[a-z0-9]+]] [shape=record,tooltip="N[[D]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z1Dv -\> alloc}"]; +; DOTPRE: Node[[C:0x[a-z0-9]+]] [shape=record,tooltip="N[[C]] ContextIds: 1",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 12176601099670543485\nnull call (external)}"]; +; DOTPRE: Node[[C]] -> Node[[D]][tooltip="ContextIds: 1",fillcolor="cyan"]; +; DOTPRE: Node[[F:0x[a-z0-9]+]] [shape=record,tooltip="N[[F]] ContextIds: 2",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 13543580133643026784\nnull call (external)}"]; +; DOTPRE: Node[[F]] -> Node[[D]][tooltip="ContextIds: 2",fillcolor="brown1"]; +; DOTPRE: } + + +; DOTPOST:digraph "postbuild" { +; DOTPOST: label="postbuild"; +; DOTPOST: Node[[D:0x[a-z0-9]+]] [shape=record,tooltip="N[[D]] ContextIds: 1 2 3 4",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z1Dv -\> alloc}"]; +; DOTPOST: Node[[F:0x[a-z0-9]+]] [shape=record,tooltip="N[[F]] ContextIds: 2",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 13543580133643026784\n_Z1Fv -\> _Z1Dv}"]; +; DOTPOST: Node[[F]] -> Node[[D]][tooltip="ContextIds: 2",fillcolor="brown1"]; +; DOTPOST: Node[[C:0x[a-z0-9]+]] [shape=record,tooltip="N[[C]] ContextIds: 3",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 0\n_Z1Cv -\> _Z1Dv}"]; +; DOTPOST: Node[[C]] -> Node[[D]][tooltip="ContextIds: 3",fillcolor="cyan"]; +; DOTPOST: Node[[B:0x[a-z0-9]+]] [shape=record,tooltip="N[[B]] ContextIds: 4",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 0\n_Z1Bv -\> _Z1Dv}"]; +; DOTPOST: Node[[B]] -> Node[[D]][tooltip="ContextIds: 
4",fillcolor="cyan"]; +; DOTPOST: Node[[E:0x[a-z0-9]+]] [shape=record,tooltip="N[[E]] ContextIds: 1",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 0\n_Z1Ev -\> _Z1Dv}"]; +; DOTPOST: Node[[E]] -> Node[[D]][tooltip="ContextIds: 1",fillcolor="cyan"]; +; DOTPOST:} diff --git a/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids2.ll b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids2.ll new file mode 100644 index 0000000000000..af7dece9421a9 --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids2.ll @@ -0,0 +1,390 @@ +;; Test callsite context graph generation for call graph with with MIBs +;; that have pruned contexts that partially match multiple inlined +;; callsite contexts, requiring duplication of context ids and nodes +;; while matching callsite nodes onto the graph. This test requires more +;; complex duplication due to multiple contexts for different allocations +;; that share some of the same callsite nodes. +;; +;; Original code looks like: +;; +;; char *D(bool Call1) { +;; if (Call1) +;; return new char[10]; +;; else +;; return new char[10]; +;; } +;; +;; char *C(bool Call1) { +;; return D(Call1); +;; } +;; +;; char *B(bool Call1) { +;; if (Call1) +;; return C(true); +;; else +;; return C(false); +;; } +;; +;; char *A(bool Call1) { +;; return B(Call1); +;; } +;; +;; char *A1() { +;; return A(true); +;; } +;; +;; char *A2() { +;; return A(true); +;; } +;; +;; char *A3() { +;; return A(false); +;; } +;; +;; char *A4() { +;; return A(false); +;; } +;; +;; char *E() { +;; return B(true); +;; } +;; +;; char *F() { +;; return B(false); +;; } +;; +;; int main(int argc, char **argv) { +;; char *a1 = A1(); // cold +;; char *a2 = A2(); // cold +;; char *e = E(); // default +;; char *a3 = A3(); // default +;; char *a4 = A4(); // default +;; char *f = F(); // cold +;; memset(a1, 0, 10); +;; memset(a2, 0, 10); +;; memset(e, 0, 10); +;; memset(a3, 0, 10); +;; memset(a4, 0, 10); +;; memset(f, 0, 10); +;; delete[] a3; +;; delete[] a4; +;; delete[] e; +;; sleep(10); +;; delete[] a1; +;; delete[] a2; +;; delete[] f; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; The code below was created by forcing inlining of A into its callers, +;; without any other inlining or optimizations. Since both allocation contexts +;; via A for each allocation in D have the same allocation type (cold via +;; A1 and A2 for the first new in D, and non-cold via A3 and A4 for the second +;; new in D, the contexts for those respective allocations are pruned above A. +;; The allocations via E and F are to ensure we don't prune above B. +;; +;; The matching onto the inlined A[1234]->A sequences will require duplication +;; of the context id assigned to the context from A for each allocation in D. +;; This test ensures that we do this correctly in the presence of callsites +;; shared by the different duplicated context ids (i.e. callsite in C). +;; +;; The IR was then reduced using llvm-reduce with the expected FileCheck input. 
+ +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_Z1Db,plx \ +; RUN: -r=%t.o,_Z1Cb,plx \ +; RUN: -r=%t.o,_Z1Bb,plx \ +; RUN: -r=%t.o,_Z1Ab,plx \ +; RUN: -r=%t.o,_Z2A1v,plx \ +; RUN: -r=%t.o,_Z2A2v,plx \ +; RUN: -r=%t.o,_Z2A3v,plx \ +; RUN: -r=%t.o,_Z2A4v,plx \ +; RUN: -r=%t.o,_Z1Ev,plx \ +; RUN: -r=%t.o,_Z1Fv,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ +; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP + + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define ptr @_Z1Db(i1 %Call1) { +entry: + %call = call ptr @_Znam(i64 0), !memprof !0, !callsite !5 + br label %return + +if.else: ; No predecessors! + %call1 = call ptr @_Znam(i64 0), !memprof !6, !callsite !11 + br label %return + +return: ; preds = %if.else, %entry + ret ptr null +} + +declare ptr @_Znam(i64) + +define ptr @_Z1Cb(i1 %Call1) { +entry: + %call = call ptr @_Z1Db(i1 false), !callsite !12 + ret ptr null +} + +define ptr @_Z1Bb(i1 %Call1) { +entry: + %call = call ptr @_Z1Cb(i1 false), !callsite !13 + br label %return + +if.else: ; No predecessors! + %call1 = call ptr @_Z1Cb(i1 false), !callsite !14 + br label %return + +return: ; preds = %if.else, %entry + ret ptr null +} + +define ptr @_Z1Ab() { +entry: + %call = call ptr @_Z1Bb(i1 false), !callsite !15 + ret ptr null +} + +define ptr @_Z2A1v() { +entry: + %call.i = call ptr @_Z1Bb(i1 false), !callsite !16 + ret ptr null +} + +define ptr @_Z2A2v() { +entry: + %call.i = call ptr @_Z1Bb(i1 false), !callsite !17 + ret ptr null +} + +define ptr @_Z2A3v() { +entry: + %call.i = call ptr @_Z1Bb(i1 false), !callsite !18 + ret ptr null +} + +define ptr @_Z2A4v() { +entry: + %call.i = call ptr @_Z1Bb(i1 false), !callsite !19 + ret ptr null +} + +define ptr @_Z1Ev() { +entry: + %call = call ptr @_Z1Bb(i1 false), !callsite !20 + ret ptr null +} + +define ptr @_Z1Fv() { +entry: + %call = call ptr @_Z1Bb(i1 false), !callsite !21 + ret ptr null +} + +declare i32 @main() + +declare void @_ZdaPv() + +declare i32 @sleep() + +; uselistorder directives +uselistorder ptr @_Znam, { 1, 0 } + +!0 = !{!1, !3} +!1 = !{!2, !"notcold"} +!2 = !{i64 4854880825882961848, i64 -904694911315397047, i64 6532298921261778285, i64 1905834578520680781} +!3 = !{!4, !"cold"} +!4 = !{i64 4854880825882961848, i64 -904694911315397047, i64 6532298921261778285, i64 -6528110295079665978} +!5 = !{i64 4854880825882961848} +!6 = !{!7, !9} +!7 = !{!8, !"notcold"} +!8 = !{i64 -8775068539491628272, i64 -904694911315397047, i64 7859682663773658275, i64 -6528110295079665978} +!9 = !{!10, !"cold"} +!10 = !{i64 -8775068539491628272, i64 -904694911315397047, i64 7859682663773658275, i64 -4903163940066524832} +!11 = !{i64 -8775068539491628272} +!12 = !{i64 -904694911315397047} +!13 = !{i64 6532298921261778285} +!14 = !{i64 7859682663773658275} +!15 = !{i64 -6528110295079665978} +!16 = !{i64 -6528110295079665978, i64 5747919905719679568} +!17 = !{i64 -6528110295079665978, i64 -5753238080028016843} +!18 = !{i64 -6528110295079665978, i64 1794685869326395337} +!19 = !{i64 -6528110295079665978, i64 5462047985461644151} +!20 = !{i64 1905834578520680781} +!21 = !{i64 -4903163940066524832} + + +;; After adding only the alloc node memprof metadata, we only have 4 contexts (we 
only +;; match the interesting parts of the pre-update graph here). + +; DUMP: CCG before updating call stack chains: +; DUMP: Callsite Context Graph: + +; DUMP: Node [[D1:0x[a-z0-9]+]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 0, 1, 2 +; DUMP: AllocType 2 StackIds: 0, 1, 3 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 + +; DUMP: Node [[C:0x[a-z0-9]+]] +; DUMP: null Call +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 3 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D1]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: Edge from Callee [[D2:0x[a-z0-9]+]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 3 4 + +; DUMP: Node [[D2]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 0, 4, 3 +; DUMP: AllocType 2 StackIds: 0, 4, 5 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 3 4 + + +;; After updating for callsite metadata, we should have duplicated the context +;; ids coming from node A (2 and 3) 4 times, for the 4 different callers of A, +;; and used those on new nodes for those callers. Note that while in reality +;; we only have cold edges coming from A1 and A2 and noncold from A3 and A4, +;; due to the pruning we have lost this information and thus end up duplicating +;; both of A's contexts to all of the new nodes (which could result in some +;; unnecessary cloning. + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[D1]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 0, 1, 2 +; DUMP: AllocType 2 StackIds: 0, 1, 3 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 5 7 9 11 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[D1]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 1 2 5 7 9 11 + +; DUMP: Node [[C]] +; DUMP: Callee: 11485875876353461977 (_Z1Db) Clones: 0 StackIds: 0 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 3 4 5 6 7 8 9 10 11 12 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D1]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 1 2 5 7 9 11 +; DUMP: Edge from Callee [[D2]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 3 4 6 8 10 12 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[C]] to Caller: [[B1:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 5 7 9 11 +; DUMP: Edge from Callee [[C]] to Caller: [[B2:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 3 4 6 8 10 12 + +; DUMP: Node [[B1]] +; DUMP: Callee: 15062806102884567440 (_Z1Cb) Clones: 0 StackIds: 1 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 5 7 9 11 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[C]] to Caller: [[B1]] AllocTypes: NotColdCold ContextIds: 1 2 5 7 9 11 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[B1]] to Caller: [[E:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[B1]] to Caller: [[A2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 5 +; DUMP: Edge from Callee [[B1]] to Caller: [[A3:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 7 +; DUMP: Edge from Callee [[B1]] to Caller: [[A1:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 9 +; DUMP: Edge from Callee [[B1]] to Caller: [[A4:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 11 +; DUMP: Edge from Callee [[B1]] to Caller: [[A:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 + +; DUMP: Node [[E]] +; DUMP: Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 2 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[B1]] to 
Caller: [[E]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[D2]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 0, 4, 3 +; DUMP: AllocType 2 StackIds: 0, 4, 5 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 3 4 6 8 10 12 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[D2]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 3 4 6 8 10 12 + +; DUMP: Node [[B2]] +; DUMP: Callee: 15062806102884567440 (_Z1Cb) Clones: 0 StackIds: 4 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 3 4 6 8 10 12 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[C]] to Caller: [[B2]] AllocTypes: NotColdCold ContextIds: 3 4 6 8 10 12 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[B2]] to Caller: [[F:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4 +; DUMP: Edge from Callee [[B2]] to Caller: [[A2]] AllocTypes: NotCold ContextIds: 6 +; DUMP: Edge from Callee [[B2]] to Caller: [[A3]] AllocTypes: NotCold ContextIds: 8 +; DUMP: Edge from Callee [[B2]] to Caller: [[A1]] AllocTypes: NotCold ContextIds: 10 +; DUMP: Edge from Callee [[B2]] to Caller: [[A4]] AllocTypes: NotCold ContextIds: 12 +; DUMP: Edge from Callee [[B2]] to Caller: [[A]] AllocTypes: NotCold ContextIds: 3 + +; DUMP: Node [[F]] +; DUMP: Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 5 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[B2]] to Caller: [[F]] AllocTypes: Cold ContextIds: 4 +; DUMP: CallerEdges: + +; DUMP: Node [[A2]] +; DUMP: Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 3, 7 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 5 6 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[B1]] to Caller: [[A2]] AllocTypes: Cold ContextIds: 5 +; DUMP: Edge from Callee [[B2]] to Caller: [[A2]] AllocTypes: NotCold ContextIds: 6 +; DUMP: CallerEdges: + +; DUMP: Node [[A3]] +; DUMP: Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 3, 8 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 7 8 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[B1]] to Caller: [[A3]] AllocTypes: Cold ContextIds: 7 +; DUMP: Edge from Callee [[B2]] to Caller: [[A3]] AllocTypes: NotCold ContextIds: 8 +; DUMP: CallerEdges: + +; DUMP: Node [[A1]] +; DUMP: Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 3 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 9 10 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[B1]] to Caller: [[A1]] AllocTypes: Cold ContextIds: 9 +; DUMP: Edge from Callee [[B2]] to Caller: [[A1]] AllocTypes: NotCold ContextIds: 10 +; DUMP: CallerEdges: + +; DUMP: Node [[A4]] +; DUMP: Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 3, 9 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 11 12 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[B1]] to Caller: [[A4]] AllocTypes: Cold ContextIds: 11 +; DUMP: Edge from Callee [[B2]] to Caller: [[A4]] AllocTypes: NotCold ContextIds: 12 +; DUMP: CallerEdges: + +; DUMP: Node [[A]] +; DUMP: Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 3, 6 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 2 3 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[B1]] to Caller: [[A]] AllocTypes: Cold ContextIds: 2 +; DUMP: Edge from Callee [[B2]] to Caller: [[A]] AllocTypes: NotCold ContextIds: 3 +; DUMP: CallerEdges: diff --git a/llvm/test/ThinLTO/X86/memprof-indirectcall.ll b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll new file mode 100644 index 
0000000000000..30c8bd27f37b7 --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll @@ -0,0 +1,266 @@ +;; Tests callsite context graph generation for call graph containing indirect +;; calls. Currently this should result in conservative behavior, such that the +;; indirect call receives a null call in its graph node, to prevent subsequent +;; cloning. +;; +;; Original code looks like: +;; +;; char *foo() { +;; return new char[10]; +;; } +;; class A { +;; public: +;; virtual char *x() { return foo(); } +;; }; +;; class B : public A { +;; public: +;; char *x() final { return foo(); } +;; }; +;; char *bar(A *a) { +;; return a->x(); +;; } +;; int main(int argc, char **argv) { +;; char *x = foo(); +;; char *y = foo(); +;; B b; +;; char *z = bar(&b); +;; char *w = bar(&b); +;; A a; +;; char *r = bar(&a); +;; char *s = bar(&a); +;; memset(x, 0, 10); +;; memset(y, 0, 10); +;; memset(z, 0, 10); +;; memset(w, 0, 10); +;; memset(r, 0, 10); +;; memset(s, 0, 10); +;; delete[] x; +;; delete[] w; +;; delete[] r; +;; sleep(10); +;; delete[] y; +;; delete[] z; +;; delete[] s; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; Compiled without optimization to prevent inlining and devirtualization. +;; +;; The IR was then reduced using llvm-reduce with the expected FileCheck input. + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,_ZTVN10__cxxabiv120__si_class_type_infoE, \ +; RUN: -r=%t.o,_ZTVN10__cxxabiv117__class_type_infoE, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ +; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. 
\ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP + +; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT + + +source_filename = "indirectcall.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@_ZTVN10__cxxabiv120__si_class_type_infoE = external global ptr +@_ZTVN10__cxxabiv117__class_type_infoE = external global ptr + +define internal ptr @_Z3barP1A(ptr %a) { +entry: + ret ptr null +} + +define i32 @main() { +entry: + %call = call ptr @_Z3foov(), !callsite !0 + %call1 = call ptr @_Z3foov(), !callsite !1 + %call2 = call ptr @_Z3barP1A(ptr null), !callsite !2 + %call3 = call ptr @_Z3barP1A(ptr null), !callsite !3 + %call4 = call ptr @_Z3barP1A(ptr null), !callsite !4 + %call5 = call ptr @_Z3barP1A(ptr null), !callsite !5 + ret i32 0 +} + +declare void @_ZdaPv() + +declare i32 @sleep() + +define internal ptr @_ZN1A1xEv() { +entry: + %call = call ptr @_Z3foov(), !callsite !6 + ret ptr null +} + +define internal ptr @_ZN1B1xEv() { +entry: + %call = call ptr @_Z3foov(), !callsite !7 + ret ptr null +} + +define internal ptr @_Z3foov() { +entry: + %call = call ptr @_Znam(i64 0), !memprof !8, !callsite !21 + ret ptr null +} + +declare ptr @_Znam(i64) + +; uselistorder directives +uselistorder ptr @_Z3foov, { 3, 2, 1, 0 } + +!0 = !{i64 8632435727821051414} +!1 = !{i64 -3421689549917153178} +!2 = !{i64 6792096022461663180} +!3 = !{i64 -2709642582978494015} +!4 = !{i64 748269490701775343} +!5 = !{i64 -5747251260480066785} +!6 = !{i64 8256774051149711748} +!7 = !{i64 -4831879094954754638} +!8 = !{!9, !11, !13, !15, !17, !19} +!9 = !{!10, !"notcold"} +!10 = !{i64 2732490490862098848, i64 8256774051149711748, i64 -4820244510750103755, i64 748269490701775343} +!11 = !{!12, !"cold"} +!12 = !{i64 2732490490862098848, i64 8256774051149711748, i64 -4820244510750103755, i64 -5747251260480066785} +!13 = !{!14, !"notcold"} +!14 = !{i64 2732490490862098848, i64 8632435727821051414} +!15 = !{!16, !"cold"} +!16 = !{i64 2732490490862098848, i64 -4831879094954754638, i64 -4820244510750103755, i64 6792096022461663180} +!17 = !{!18, !"notcold"} +!18 = !{i64 2732490490862098848, i64 -4831879094954754638, i64 -4820244510750103755, i64 -2709642582978494015} +!19 = !{!20, !"cold"} +!20 = !{i64 2732490490862098848, i64 -3421689549917153178} +!21 = !{i64 2732490490862098848} + + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[FOO:0x[a-z0-9]+]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 6, 8, 4 +; DUMP: AllocType 2 StackIds: 6, 8, 5 +; DUMP: AllocType 1 StackIds: 0 +; DUMP: AllocType 2 StackIds: 7, 8, 2 +; DUMP: AllocType 1 StackIds: 7, 8, 3 +; DUMP: AllocType 2 StackIds: 1 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 3 4 5 6 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[AX:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 3 +; DUMP: Edge from Callee [[FOO]] to Caller: [[BX:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 4 5 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 6 + +; DUMP: Node [[AX]] +; DUMP: Callee: 12914368124089294956 (_Z3foov) Clones: 0 StackIds: 6 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[AX]] AllocTypes: NotColdCold ContextIds: 
1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[AX]] to Caller: [[BAR:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 + +;; Bar contains an indirect call, with multiple targets. It's call should be null. +; DUMP: Node [[BAR]] +; DUMP: null Call +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 4 5 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[AX]] to Caller: [[BAR]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: Edge from Callee [[BX]] to Caller: [[BAR]] AllocTypes: NotColdCold ContextIds: 4 5 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN3:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN4:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN5:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4 +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN6:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 5 + +; DUMP: Node [[MAIN3]] +; DUMP: Callee: 4095956691517954349 (_Z3barP1A) Clones: 0 StackIds: 4 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN3]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN4]] +; DUMP: Callee: 4095956691517954349 (_Z3barP1A) Clones: 0 StackIds: 5 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN4]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN1]] +; DUMP: Callee: 12914368124089294956 (_Z3foov) Clones: 0 StackIds: 0 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 3 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 3 +; DUMP: CallerEdges: + +; DUMP: Node [[BX]] +; DUMP: Callee: 12914368124089294956 (_Z3foov) Clones: 0 StackIds: 7 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 4 5 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[BX]] AllocTypes: NotColdCold ContextIds: 4 5 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BX]] to Caller: [[BAR]] AllocTypes: NotColdCold ContextIds: 4 5 + +; DUMP: Node [[MAIN5]] +; DUMP: Callee: 4095956691517954349 (_Z3barP1A) Clones: 0 StackIds: 2 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN5]] AllocTypes: Cold ContextIds: 4 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN6]] +; DUMP: Callee: 4095956691517954349 (_Z3barP1A) Clones: 0 StackIds: 3 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 5 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN6]] AllocTypes: NotCold ContextIds: 5 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN2]] +; DUMP: Callee: 12914368124089294956 (_Z3foov) Clones: 0 StackIds: 1 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 6 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 6 +; DUMP: CallerEdges: + + +; DOT: digraph "postbuild" { +; DOT: label="postbuild"; +; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2 3 4 5 6",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3foov -\> alloc}"]; +; DOT: Node[[AX:0x[a-z0-9]+]] [shape=record,tooltip="N[[AX]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 8256774051149711748\n_ZN1A1xEv -\> _Z3foov}"]; +; DOT: Node[[AX]] -> 
Node[[FOO]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"]; +; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2 4 5",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 13626499562959447861\nnull call (external)}"]; +; DOT: Node[[BAR]] -> Node[[AX]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"]; +; DOT: Node[[BAR]] -> Node[[BX:0x[a-z0-9]+]][tooltip="ContextIds: 4 5",fillcolor="mediumorchid1"]; +; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 748269490701775343\nmain -\> _Z3barP1A}"]; +; DOT: Node[[MAIN1]] -> Node[[BAR]][tooltip="ContextIds: 1",fillcolor="brown1"]; +; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 12699492813229484831\nmain -\> _Z3barP1A}"]; +; DOT: Node[[MAIN2]] -> Node[[BAR]][tooltip="ContextIds: 2",fillcolor="cyan"]; +; DOT: Node[[MAIN3:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN3]] ContextIds: 3",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN3]] -> Node[[FOO]][tooltip="ContextIds: 3",fillcolor="brown1"]; +; DOT: Node[[BX]] [shape=record,tooltip="N[[BX]] ContextIds: 4 5",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 13614864978754796978\n_ZN1B1xEv -\> _Z3foov}"]; +; DOT: Node[[BX]] -> Node[[FOO]][tooltip="ContextIds: 4 5",fillcolor="mediumorchid1"]; +; DOT: Node[[MAIN4:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN4]] ContextIds: 4",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 6792096022461663180\nmain -\> _Z3barP1A}"]; +; DOT: Node[[MAIN4]] -> Node[[BAR]][tooltip="ContextIds: 4",fillcolor="cyan"]; +; DOT: Node[[MAIN5:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN5]] ContextIds: 5",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 15737101490731057601\nmain -\> _Z3barP1A}"]; +; DOT: Node[[MAIN5]] -> Node[[BAR]][tooltip="ContextIds: 5",fillcolor="brown1"]; +; DOT: Node[[MAIN6:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN6]] ContextIds: 6",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN6]] -> Node[[FOO]][tooltip="ContextIds: 6",fillcolor="cyan"]; +; DOT: } diff --git a/llvm/test/ThinLTO/X86/memprof-inlined.ll b/llvm/test/ThinLTO/X86/memprof-inlined.ll new file mode 100644 index 0000000000000..89cd878e99fb4 --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-inlined.ll @@ -0,0 +1,186 @@ +;; Test callsite context graph generation for call graph with two memprof +;; contexts and partial inlining, requiring generation of a new fused node to +;; represent the inlined sequence while matching callsite nodes onto the graph. +;; +;; Original code looks like: +;; +;; char *bar() { +;; return new char[10]; +;; } +;; +;; char *baz() { +;; return bar(); +;; } +;; +;; char *foo() { +;; return baz(); +;; } +;; +;; int main(int argc, char **argv) { +;; char *x = foo(); +;; char *y = foo(); +;; memset(x, 0, 10); +;; memset(y, 0, 10); +;; delete[] x; +;; sleep(10); +;; delete[] y; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; The code below was created by forcing inlining of baz into foo, and +;; bar into baz. Due to the inlining of bar we will initially have two +;; allocation nodes in the graph. 
This tests that we correctly match +;; foo (with baz inlined) onto the graph nodes first, and generate a new +;; fused node for it. We should then not match baz (with bar inlined) as that +;; is not reached by the MIB contexts (since all calls from main will look +;; like main -> foo(+baz) -> bar after the inlining reflected in this IR). +;; +;; The IR was then reduced using llvm-reduce with the expected FileCheck input. + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ +; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP + +; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT + + +source_filename = "inlined.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define internal ptr @_Z3barv() { +entry: + %call = call ptr @_Znam(i64 0), !memprof !0, !callsite !5 + ret ptr null +} + +declare ptr @_Znam(i64) + +define internal ptr @_Z3bazv() { +entry: + %call.i = call ptr @_Znam(i64 0), !memprof !0, !callsite !6 + ret ptr null +} + +define internal ptr @_Z3foov() { +entry: + %call.i = call ptr @_Z3barv(), !callsite !7 + ret ptr null +} + +define i32 @main() { +entry: + %call = call ptr @_Z3foov(), !callsite !8 + %call1 = call ptr @_Z3foov(), !callsite !9 + ret i32 0 +} + +declare void @_ZdaPv() + +declare i32 @sleep() + +!0 = !{!1, !3} +!1 = !{!2, !"notcold"} +!2 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!3 = !{!4, !"cold"} +!4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} +!5 = !{i64 9086428284934609951} +!6 = !{i64 9086428284934609951, i64 -5964873800580613432} +!7 = !{i64 -5964873800580613432, i64 2732490490862098848} +!8 = !{i64 8632435727821051414} +!9 = !{i64 -3421689549917153178} + + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: + +; DUMP: Node [[BAZ:0x[a-z0-9]+]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 1, 2 +; DUMP: AllocType 2 StackIds: 1, 3 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO2:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 + +;; This is leftover from the MIB on the alloc inlined into baz. It is not +;; matched with any call, since there is no such node in the IR. Due to the +;; null call it will not participate in any context transformations. 
+; DUMP: Node [[FOO2]] +; DUMP: null Call +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO2]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 + +; DUMP: Node [[MAIN1]] +; DUMP: Callee: 2229562716906371625 (_Z3foov) Clones: 0 StackIds: 2 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 3 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[FOO:0x[a-z0-9]+]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 3 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN2]] +; DUMP: Callee: 2229562716906371625 (_Z3foov) Clones: 0 StackIds: 3 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 4 +; DUMP: CallerEdges: + +; DUMP: Node [[BAR:0x[a-z0-9]+]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 0, 1, 2 +; DUMP: AllocType 2 StackIds: 0, 1, 3 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 3 4 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[FOO]] AllocTypes: NotColdCold ContextIds: 3 4 + +;; This is the node synthesized for the call to bar in foo that was created +;; by inlining baz into foo. +; DUMP: Node [[FOO]] +; DUMP: Callee: 16064618363798697104 (_Z3barv) Clones: 0 StackIds: 0, 1 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 3 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[FOO]] AllocTypes: NotColdCold ContextIds: 3 4 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 3 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 4 + + +; DOT: digraph "postbuild" { +; DOT: label="postbuild"; +; DOT: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3bazv -\> alloc}"]; +; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 2732490490862098848\nnull call (external)}"]; +; DOT: Node[[FOO]] -> Node[[BAZ]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"]; +; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1 3",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN1]] -> Node[[FOO]][tooltip="ContextIds: 1",fillcolor="brown1"]; +; DOT: Node[[MAIN1]] -> Node[[FOO2:0x[a-z0-9]+]][tooltip="ContextIds: 3",fillcolor="brown1"]; +; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2 4",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN2]] -> Node[[FOO]][tooltip="ContextIds: 2",fillcolor="cyan"]; +; DOT: Node[[MAIN2]] -> Node[[FOO2]][tooltip="ContextIds: 4",fillcolor="cyan"]; +; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 
Alloc2\n_Z3barv -\> alloc}"]; +; DOT: Node[[FOO2]] [shape=record,tooltip="N[[FOO2]] ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 0\n_Z3foov -\> _Z3barv}"]; +; DOT: Node[[FOO2]] -> Node[[BAR]][tooltip="ContextIds: 3 4",fillcolor="mediumorchid1"]; +; DOT: } diff --git a/llvm/test/ThinLTO/X86/memprof-inlined2.ll b/llvm/test/ThinLTO/X86/memprof-inlined2.ll new file mode 100644 index 0000000000000..1ffae8cd59cef --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-inlined2.ll @@ -0,0 +1,124 @@ +;; Test callsite context graph generation for call graph with two memprof +;; contexts and multiple levels of inlining, requiring generation of new +;; fused nodes to represent the inlined sequence while matching callsite +;; nodes onto the graph. In particular this tests the case where a function +;; has inlined a callee containing an inlined callee. +;; +;; Original code looks like: +;; +;; char *bar() __attribute__((noinline)) { +;; return new char[10]; +;; } +;; +;; char *baz() { +;; return bar(); +;; } +;; +;; char *foo() { +;; return baz(); +;; } +;; +;; int main(int argc, char **argv) { +;; char *x = foo(); +;; char *y = foo(); +;; memset(x, 0, 10); +;; memset(y, 0, 10); +;; delete[] x; +;; sleep(10); +;; delete[] y; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; Both foo and baz are inlined into main, at both foo callsites. +;; We should update the graph for new fused nodes for both of those inlined +;; callsites to bar. +;; +;; Note that baz and bar are both dead due to the inlining, but have been left +;; in the input IR to ensure that the MIB call chain is matched to the longer +;; inline sequences from main. +;; +;; The IR was then reduced using llvm-reduce with the expected FileCheck input. 
+ +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_Z3barv,plx \ +; RUN: -r=%t.o,_Z3bazv,plx \ +; RUN: -r=%t.o,_Z3foov,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP + + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define ptr @_Z3barv() { +entry: + %call = call ptr @_Znam(i64 0), !memprof !0, !callsite !5 + ret ptr null +} + +declare ptr @_Znam(i64) + +declare ptr @_Z3bazv() + +declare ptr @_Z3foov() + +define i32 @main() { +delete.end5: + %call.i.i = call ptr @_Z3barv(), !callsite !6 + %call.i.i8 = call ptr @_Z3barv(), !callsite !7 + ret i32 0 +} + +declare void @_ZdaPv() + +declare i32 @sleep() + +!0 = !{!1, !3} +!1 = !{!2, !"notcold"} +!2 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!3 = !{!4, !"cold"} +!4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} +!5 = !{i64 9086428284934609951} +!6 = !{i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!7 = !{i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} + + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[BAR:0x[a-z0-9]+]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 0, 1, 2 +; DUMP: AllocType 2 StackIds: 0, 1, 3 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 + +;; This is the node synthesized for the first inlined call chain of main->foo->baz +; DUMP: Node [[MAIN1]] +; DUMP: Callee: 17377440600225628772 (_Z3barv) Clones: 0 StackIds: 0, 1, 2 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +;; This is the node synthesized for the second inlined call chain of main->foo->baz +; DUMP: Node [[MAIN2]] +; DUMP: Callee: 17377440600225628772 (_Z3barv) Clones: 0 StackIds: 0, 1, 3 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: From 201fdef40dd6ec193d18d39638454a3c972f1fec Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Mon, 20 Mar 2023 18:38:04 -0700 Subject: [PATCH 047/208] libclang: Pass Clang install directory to driver via argv[0]. Various driver features, such as the sysroot path detection for Android targets, rely on being able to find the Clang install directory (look for callers of `getDriver().getInstalledDir()`). However, the install directory isn't currently being plumbed through to the driver, which is conventionally done via the argv[0] passed to the Driver constructor. It looks like D14695 attempted to fix this by adding another API that allows specifying the argv[0]. 
However, rather than requiring every user of libclang to switch to this API for correct behavior, let's have the other existing APIs work by default, by using the existing logic in libclang for finding the install directory. Differential Revision: https://reviews.llvm.org/D146497 --- clang/docs/ReleaseNotes.rst | 8 ++++++++ clang/include/clang-c/Index.h | 9 +++++++-- clang/test/Index/record-completion-invocation.c | 2 +- clang/test/Index/record-parsing-invocation.c | 4 ++-- clang/tools/libclang/CIndex.cpp | 11 ++++++++++- 5 files changed, 28 insertions(+), 6 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 005bf99a62457..94e0f10a31743 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -368,6 +368,14 @@ libclang has an evaluable bit width. Fixes undefined behavior when called on a bit-field whose width depends on a template paramter. +- ``clang_parseTranslationUnit`` and ``clang_parseTranslationUnit2`` have been + changed to automatically locate the Clang installation directory relative to + the location of the libclang binary and use it for system headers installed + alongside the Clang installation. It is no longer necessary to manually + locate such system headers or use the ``clang_parseTranslationUnit2FullArgv`` + function for this purpose if libclang has been installed in the default + location. + Static Analyzer --------------- - Fix incorrect alignment attribute on the this parameter of certain diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h index c7d32e6a152ae..8275f2941a41c 100644 --- a/clang/include/clang-c/Index.h +++ b/clang/include/clang-c/Index.h @@ -899,8 +899,13 @@ CINDEX_LINKAGE enum CXErrorCode clang_parseTranslationUnit2( /** * Same as clang_parseTranslationUnit2 but requires a full command line - * for \c command_line_args including argv[0]. This is useful if the standard - * library paths are relative to the binary. + * for \c command_line_args including argv[0]. + * + * This is useful if the driver uses paths relative to the binary and either + * you are targeting libclang versions older than Clang 17, or libclang is + * installed to a non-standard location. Clang 17 and newer will automatically + * use the correct argv[0] if libclang is installed in the lib directory + * parallel to the bin directory where the clang binary is installed. 
*/ CINDEX_LINKAGE enum CXErrorCode clang_parseTranslationUnit2FullArgv( CXIndex CIdx, const char *source_filename, diff --git a/clang/test/Index/record-completion-invocation.c b/clang/test/Index/record-completion-invocation.c index 4b667134fa2d4..75eb9083908ae 100644 --- a/clang/test/Index/record-completion-invocation.c +++ b/clang/test/Index/record-completion-invocation.c @@ -9,4 +9,4 @@ // RUN: env LIBCLANG_DISABLE_CRASH_RECOVERY=1 CINDEXTEST_INVOCATION_EMISSION_PATH=%t not --crash c-index-test -code-completion-at=%s:10:1 "-remap-file=%s,%S/Inputs/record-parsing-invocation-remap.c" %s // RUN: cat %t/libclang-* | FileCheck %s -// CHECK: {"toolchain":"{{.*}}","libclang.operation":"complete","libclang.opts":1,"args":["clang","-fno-spell-checking","{{.*}}record-completion-invocation.c","-Xclang","-detailed-preprocessing-record","-fallow-editor-placeholders"],"invocation-args":["-code-completion-at={{.*}}record-completion-invocation.c:10:1"],"unsaved_file_hashes":[{"name":"{{.*}}record-completion-invocation.c","md5":"aee23773de90e665992b48209351d70e"}]} +// CHECK: {"toolchain":"{{.*}}","libclang.operation":"complete","libclang.opts":1,"args":["{{.*}}bin{{.*}}clang","-fno-spell-checking","{{.*}}record-completion-invocation.c","-Xclang","-detailed-preprocessing-record","-fallow-editor-placeholders"],"invocation-args":["-code-completion-at={{.*}}record-completion-invocation.c:10:1"],"unsaved_file_hashes":[{"name":"{{.*}}record-completion-invocation.c","md5":"aee23773de90e665992b48209351d70e"}]} diff --git a/clang/test/Index/record-parsing-invocation.c b/clang/test/Index/record-parsing-invocation.c index e0c4cdb05fb00..f370f014fb1cc 100644 --- a/clang/test/Index/record-parsing-invocation.c +++ b/clang/test/Index/record-parsing-invocation.c @@ -25,5 +25,5 @@ # pragma clang __debug parser_crash #endif -// CHECK: {"toolchain":"{{.*}}","libclang.operation":"parse","libclang.opts":1,"args":["clang","-fno-spell-checking","{{.*}}record-parsing-invocation.c","-Xclang","-detailed-preprocessing-record","-fallow-editor-placeholders"]} -// CHECK-UNSAVED: {"toolchain":"{{.*}}","libclang.operation":"parse","libclang.opts":1,"args":["clang","-fno-spell-checking","{{.*}}record-parsing-invocation.c","-Xclang","-detailed-preprocessing-record","-fallow-editor-placeholders"],"unsaved_file_hashes":[{"name":"{{.*}}record-parsing-invocation.c","md5":"aee23773de90e665992b48209351d70e"}]} +// CHECK: {"toolchain":"{{.*}}","libclang.operation":"parse","libclang.opts":1,"args":["{{.*}}bin{{.*}}clang","-fno-spell-checking","{{.*}}record-parsing-invocation.c","-Xclang","-detailed-preprocessing-record","-fallow-editor-placeholders"]} +// CHECK-UNSAVED: {"toolchain":"{{.*}}","libclang.operation":"parse","libclang.opts":1,"args":["{{.*}}bin{{.*}}clang","-fno-spell-checking","{{.*}}record-parsing-invocation.c","-Xclang","-detailed-preprocessing-record","-fallow-editor-placeholders"],"unsaved_file_hashes":[{"name":"{{.*}}record-parsing-invocation.c","md5":"aee23773de90e665992b48209351d70e"}]} diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp index 30416e46ce173..2aa12667d37e9 100644 --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -4013,8 +4013,17 @@ enum CXErrorCode clang_parseTranslationUnit2( struct CXUnsavedFile *unsaved_files, unsigned num_unsaved_files, unsigned options, CXTranslationUnit *out_TU) { noteBottomOfStack(); + + if (!CIdx) + return CXError_InvalidArguments; + + SmallString<64> ClangPath( + static_cast(CIdx)->getClangToolchainPath()); + 
llvm::sys::path::append(ClangPath, "bin"); + llvm::sys::path::append(ClangPath, "clang"); + SmallVector Args; - Args.push_back("clang"); + Args.push_back(ClangPath.c_str()); Args.append(command_line_args, command_line_args + num_command_line_args); return clang_parseTranslationUnit2FullArgv( CIdx, source_filename, Args.data(), Args.size(), unsaved_files, From e7596a99fca6d1df14275f5293e447a4d87af06a Mon Sep 17 00:00:00 2001 From: MalavikaSamak Date: Wed, 22 Mar 2023 15:31:00 -0700 Subject: [PATCH 048/208] [-Wunsafe-buffer-usage] Add Fixable for simple pointer dereference This patch introduces PointerDereferenceGadget, a FixableGadget that emits fixits to handle cases where a pointer that is identified as unsafe is dereferenced. The current implementation only handles cases where the strategy is to change the type of the raw pointer to std::span. The fixit for this strategy is to fetch the first element from the corresponding span instance. For example for the code below, the PointerDereferenceGadget emits a fixit for S3 (S1, S2 are to be handled by other gadgets): S1: int *ptr = new int[10]; S2: int val1 = ptr[k]; // Unsafe operation S3: int val2 = *ptr; => Fixit: int val2 = ptr[0]; Differential revision: https://reviews.llvm.org/D143206 --- .../Analyses/UnsafeBufferUsageGadgets.def | 1 + clang/lib/Analysis/UnsafeBufferUsage.cpp | 70 +++++++++++++++++++ ...safe-buffer-usage-fixits-pointer-deref.cpp | 55 +++++++++++++++ 3 files changed, 126 insertions(+) create mode 100644 clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-pointer-deref.cpp diff --git a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def index 89f7c1ed2ba24..a8485682c1d1f 100644 --- a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def +++ b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def @@ -32,6 +32,7 @@ WARNING_GADGET(PointerArithmetic) WARNING_GADGET(UnsafeBufferUsageAttr) FIXABLE_GADGET(ULCArraySubscript) FIXABLE_GADGET(DerefSimplePtrArithFixable) +FIXABLE_GADGET(PointerDereference) #undef FIXABLE_GADGET #undef WARNING_GADGET diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp index 04e11d0471a7d..95e4c8388bc44 100644 --- a/clang/lib/Analysis/UnsafeBufferUsage.cpp +++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp @@ -463,6 +463,45 @@ class ULCArraySubscriptGadget : public FixableGadget { return {}; } }; + +class PointerDereferenceGadget : public FixableGadget { + static constexpr const char *const BaseDeclRefExprTag = "BaseDRE"; + static constexpr const char *const OperatorTag = "op"; + + const DeclRefExpr *BaseDeclRefExpr = nullptr; + const UnaryOperator *Op = nullptr; + +public: + PointerDereferenceGadget(const MatchFinder::MatchResult &Result) + : FixableGadget(Kind::PointerDereference), + BaseDeclRefExpr( + Result.Nodes.getNodeAs(BaseDeclRefExprTag)), + Op(Result.Nodes.getNodeAs(OperatorTag)) {} + + static bool classof(const Gadget *G) { + return G->getKind() == Kind::PointerDereference; + } + + static Matcher matcher() { + auto Target = + unaryOperator( + hasOperatorName("*"), + has(expr(ignoringParenImpCasts( + declRefExpr(to(varDecl())).bind(BaseDeclRefExprTag))))) + .bind(OperatorTag); + + return expr(isInUnspecifiedLvalueContext(Target)); + } + + DeclUseList getClaimedVarUseSites() const override { + return {BaseDeclRefExpr}; + } + + virtual const Stmt *getBaseStmt() const final { return Op; } + + virtual std::optional getFixits(const Strategy &S) const 
override; +}; + } // namespace namespace { @@ -914,6 +953,37 @@ DerefSimplePtrArithFixableGadget::getFixits(const Strategy &s) const { return std::nullopt; // something wrong or unsupported, give up } +std::optional +PointerDereferenceGadget::getFixits(const Strategy &S) const { + const VarDecl *VD = cast(BaseDeclRefExpr->getDecl()); + switch (S.lookup(VD)) { + case Strategy::Kind::Span: { + ASTContext &Ctx = VD->getASTContext(); + SourceManager &SM = Ctx.getSourceManager(); + // Required changes: *(ptr); => (ptr[0]); and *ptr; => ptr[0] + // Deletes the *operand + CharSourceRange derefRange = clang::CharSourceRange::getCharRange( + Op->getBeginLoc(), Op->getBeginLoc().getLocWithOffset(1)); + // Inserts the [0] + std::optional endOfOperand = + getEndCharLoc(BaseDeclRefExpr, SM, Ctx.getLangOpts()); + if (endOfOperand) { + return FixItList{{FixItHint::CreateRemoval(derefRange), + FixItHint::CreateInsertion( + endOfOperand.value().getLocWithOffset(1), "[0]")}}; + } + } + case Strategy::Kind::Iterator: + case Strategy::Kind::Array: + case Strategy::Kind::Vector: + llvm_unreachable("Strategy not implemented yet!"); + case Strategy::Kind::Wontfix: + llvm_unreachable("Invalid strategy!"); + } + + return std::nullopt; +} + // For a non-null initializer `Init` of `T *` type, this function returns // `FixItHint`s producing a list initializer `{Init, S}` as a part of a fix-it // to output stream. diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-pointer-deref.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-pointer-deref.cpp new file mode 100644 index 0000000000000..4a02bbdf71182 --- /dev/null +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-pointer-deref.cpp @@ -0,0 +1,55 @@ +// RUN: %clang_cc1 -std=c++20 -Wunsafe-buffer-usage -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s + +void basic_dereference() { + int tmp; + auto p = new int[10]; + // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" + // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:23-[[@LINE-3]]:23}:", 10}" + tmp = p[5]; + int val = *p; + // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:13-[[@LINE-1]]:14}:"" + // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:15-[[@LINE-2]]:15}:"[0]" +} + +int return_method() { + auto p = new int[10]; + // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" + // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:23-[[@LINE-3]]:23}:", 10}" + int tmp = p[5]; + return *p; + // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:11}:"" + // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"[0]" +} + +void foo(int v) { +} + +void method_invocation() { + auto p = new int[10]; + // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" + // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:23-[[@LINE-3]]:23}:", 10}" + + int tmp = p[5]; + + foo(*p); + // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:7-[[@LINE-1]]:8}:"" + // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:9-[[@LINE-2]]:9}:"[0]" +} + +void binary_operation() { + auto p = new int[10]; + // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:11}:"std::span p" + // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:12-[[@LINE-2]]:12}:"{" + // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-3]]:23-[[@LINE-3]]:23}:", 10}" + + int tmp = p[5]; + + int k = *p + 20; + // CHECK-DAG: 
fix-it:"{{.*}}":{[[@LINE-1]]:11-[[@LINE-1]]:12}:"" + // CHECK-DAG: fix-it:"{{.*}}":{[[@LINE-2]]:13-[[@LINE-2]]:13}:"[0]" + +} + From 909e5ce47a70181dead332826e93f89b2928f0c0 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Wed, 22 Mar 2023 19:04:09 -0400 Subject: [PATCH 049/208] [mlir][arith] Add `uitofp` support to WIE This includes standard LIT tests and integration tests with the LLVM CPU runner. I plan to use this to implement `sitofp` in D146597. Reviewed By: antiagainst Differential Revision: https://reviews.llvm.org/D146606 --- .../Arith/Transforms/EmulateWideInt.cpp | 69 ++++++++++++++++- .../emulate-wide-int-canonicalization.mlir | 14 ++++ mlir/test/Dialect/Arith/emulate-wide-int.mlir | 56 ++++++++++++++ .../test-wide-int-emulation-uitofp-i32.mlir | 77 +++++++++++++++++++ 4 files changed, 214 insertions(+), 2 deletions(-) create mode 100644 mlir/test/Dialect/Arith/emulate-wide-int-canonicalization.mlir create mode 100644 mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-uitofp-i32.mlir diff --git a/mlir/lib/Dialect/Arith/Transforms/EmulateWideInt.cpp b/mlir/lib/Dialect/Arith/Transforms/EmulateWideInt.cpp index db3ddab483b5a..83f01397c4490 100644 --- a/mlir/lib/Dialect/Arith/Transforms/EmulateWideInt.cpp +++ b/mlir/lib/Dialect/Arith/Transforms/EmulateWideInt.cpp @@ -13,6 +13,7 @@ #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/Func/Transforms/FuncConversions.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/IR/TypeUtilities.h" #include "mlir/Support/LogicalResult.h" #include "mlir/Transforms/DialectConversion.h" #include "llvm/Support/FormatVariadic.h" @@ -906,6 +907,70 @@ struct ConvertShRSI final : OpConversionPattern { } }; +//===----------------------------------------------------------------------===// +// ConvertUIToFP +//===----------------------------------------------------------------------===// + +struct ConvertUIToFP final : OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(arith::UIToFPOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Location loc = op.getLoc(); + + Type oldTy = op.getIn().getType(); + auto newTy = + dyn_cast_or_null(getTypeConverter()->convertType(oldTy)); + if (!newTy) + return rewriter.notifyMatchFailure( + loc, llvm::formatv("unsupported type: {0}", oldTy)); + unsigned newBitWidth = newTy.getElementTypeBitWidth(); + + auto [low, hi] = extractLastDimHalves(rewriter, loc, adaptor.getIn()); + Value lowInt = dropTrailingX1Dim(rewriter, loc, low); + Value hiInt = dropTrailingX1Dim(rewriter, loc, hi); + Value zeroCst = + createScalarOrSplatConstant(rewriter, loc, hiInt.getType(), 0); + + // The final result has the following form: + // if (hi == 0) return uitofp(low) + // else return uitofp(low) + uitofp(hi) * 2^BW + // + // where `BW` is the bitwidth of the narrowed integer type. We emit a + // select to make it easier to fold-away the `hi` part calculation when it + // is known to be zero. + // + // Note 1: The emulation is precise only for input values that have exact + // integer representation in the result floating point type, and may lead + // loss of precision otherwise. + // + // Note 2: We do not strictly need the `hi == 0`, case, but it makes + // constant folding easier. 
+ Value hiEqZero = rewriter.create( + loc, arith::CmpIPredicate::eq, hiInt, zeroCst); + + Type resultTy = op.getType(); + Type resultElemTy = getElementTypeOrSelf(resultTy); + Value lowFp = rewriter.create(loc, resultTy, lowInt); + Value hiFp = rewriter.create(loc, resultTy, hiInt); + + int64_t pow2Int = int64_t(1) << newBitWidth; + Attribute pow2Attr = + rewriter.getFloatAttr(resultElemTy, static_cast(pow2Int)); + if (auto vecTy = dyn_cast(resultTy)) + pow2Attr = SplatElementsAttr::get(vecTy, pow2Attr); + + Value pow2Val = rewriter.create(loc, resultTy, pow2Attr); + + Value hiVal = rewriter.create(loc, hiFp, pow2Val); + Value result = rewriter.create(loc, lowFp, hiVal); + + rewriter.replaceOpWithNewOp(op, hiEqZero, lowFp, result); + return success(); + } +}; + //===----------------------------------------------------------------------===// // ConvertTruncI //===----------------------------------------------------------------------===// @@ -1080,6 +1145,6 @@ void arith::populateArithWideIntEmulationPatterns( ConvertIndexCastIntToIndex, ConvertIndexCastIntToIndex, ConvertIndexCastIndexToInt, - ConvertIndexCastIndexToInt>( - typeConverter, patterns.getContext()); + ConvertIndexCastIndexToInt, + ConvertUIToFP>(typeConverter, patterns.getContext()); } diff --git a/mlir/test/Dialect/Arith/emulate-wide-int-canonicalization.mlir b/mlir/test/Dialect/Arith/emulate-wide-int-canonicalization.mlir new file mode 100644 index 0000000000000..0c95ab8284afa --- /dev/null +++ b/mlir/test/Dialect/Arith/emulate-wide-int-canonicalization.mlir @@ -0,0 +1,14 @@ +// RUN: mlir-opt --arith-emulate-wide-int="widest-int-supported=32" --canonicalize %s | FileCheck %s + +// Check that we can fold away the 'hi' part calculation when it is know to be zero. +// +// CHECK-LABEL: func @uitofp_i16_ext_f64 +// CHECK-SAME: ([[ARG:%.+]]: i16) -> f64 +// CHECK-NEXT: [[EXT:%.+]] = arith.extui [[ARG]] : i16 to i32 +// CHECK-NEXT: [[FP:%.+]] = arith.uitofp [[EXT]] : i32 to f64 +// CHECK-NEXT: return [[FP]] : f64 +func.func @uitofp_i16_ext_f64(%a : i16) -> f64 { + %ext = arith.extui %a : i16 to i64 + %r = arith.uitofp %ext : i64 to f64 + return %r : f64 +} diff --git a/mlir/test/Dialect/Arith/emulate-wide-int.mlir b/mlir/test/Dialect/Arith/emulate-wide-int.mlir index 80edc6f2ad001..55b4e7f89b0ac 100644 --- a/mlir/test/Dialect/Arith/emulate-wide-int.mlir +++ b/mlir/test/Dialect/Arith/emulate-wide-int.mlir @@ -908,3 +908,59 @@ func.func @xori_vector_a_b(%a : vector<3xi64>, %b : vector<3xi64>) -> vector<3xi %x = arith.xori %a, %b : vector<3xi64> return %x : vector<3xi64> } + +// CHECK-LABEL: func @uitofp_i64_f64 +// CHECK-SAME: ([[ARG:%.+]]: vector<2xi32>) -> f64 +// CHECK-NEXT: [[LOW:%.+]] = vector.extract [[ARG]][0] : vector<2xi32> +// CHECK-NEXT: [[HI:%.+]] = vector.extract [[ARG]][1] : vector<2xi32> +// CHECK-NEXT: [[CST0:%.+]] = arith.constant 0 : i32 +// CHECK-NEXT: [[HIEQ0:%.+]] = arith.cmpi eq, [[HI]], [[CST0]] : i32 +// CHECK-NEXT: [[LOWFP:%.+]] = arith.uitofp [[LOW]] : i32 to f64 +// CHECK-NEXT: [[HIFP:%.+]] = arith.uitofp [[HI]] : i32 to f64 +// CHECK-NEXT: [[POW:%.+]] = arith.constant 0x41F0000000000000 : f64 +// CHECK-NEXT: [[RESHI:%.+]] = arith.mulf [[HIFP]], [[POW]] : f64 +// CHECK-NEXT: [[RES:%.+]] = arith.addf [[LOWFP]], [[RESHI]] : f64 +// CHECK-NEXT: [[SEL:%.+]] = arith.select [[HIEQ0]], [[LOWFP]], [[RES]] : f64 +// CHECK-NEXT: return [[SEL]] : f64 +func.func @uitofp_i64_f64(%a : i64) -> f64 { + %r = arith.uitofp %a : i64 to f64 + return %r : f64 +} + +// CHECK-LABEL: func @uitofp_i64_f64_vector +// CHECK-SAME: 
([[ARG:%.+]]: vector<3x2xi32>) -> vector<3xf64> +// CHECK-NEXT: [[EXTLOW:%.+]] = vector.extract_strided_slice [[ARG]] {offsets = [0, 0], sizes = [3, 1], strides = [1, 1]} : vector<3x2xi32> to vector<3x1xi32> +// CHECK-NEXT: [[EXTHI:%.+]] = vector.extract_strided_slice [[ARG]] {offsets = [0, 1], sizes = [3, 1], strides = [1, 1]} : vector<3x2xi32> to vector<3x1xi32> +// CHECK-NEXT: [[LOW:%.+]] = vector.shape_cast [[EXTLOW]] : vector<3x1xi32> to vector<3xi32> +// CHECK-NEXT: [[HI:%.+]] = vector.shape_cast [[EXTHI]] : vector<3x1xi32> to vector<3xi32> +// CHECK-NEXT: [[CST0:%.+]] = arith.constant dense<0> : vector<3xi32> +// CHECK-NEXT: [[HIEQ0:%.+]] = arith.cmpi eq, [[HI]], [[CST0]] : vector<3xi32> +// CHECK-NEXT: [[LOWFP:%.+]] = arith.uitofp [[LOW]] : vector<3xi32> to vector<3xf64> +// CHECK-NEXT: [[HIFP:%.+]] = arith.uitofp [[HI]] : vector<3xi32> to vector<3xf64> +// CHECK-NEXT: [[POW:%.+]] = arith.constant dense<0x41F0000000000000> : vector<3xf64> +// CHECK-NEXT: [[RESHI:%.+]] = arith.mulf [[HIFP]], [[POW]] : vector<3xf64> +// CHECK-NEXT: [[RES:%.+]] = arith.addf [[LOWFP]], [[RESHI]] : vector<3xf64> +// CHECK-NEXT: [[SEL:%.+]] = arith.select [[HIEQ0]], [[LOWFP]], [[RES]] : vector<3xi1>, vector<3xf64> +// CHECK-NEXT: return [[SEL]] : vector<3xf64> +func.func @uitofp_i64_f64_vector(%a : vector<3xi64>) -> vector<3xf64> { + %r = arith.uitofp %a : vector<3xi64> to vector<3xf64> + return %r : vector<3xf64> +} + +// CHECK-LABEL: func @uitofp_i64_f16 +// CHECK-SAME: ([[ARG:%.+]]: vector<2xi32>) -> f16 +// CHECK-NEXT: [[LOW:%.+]] = vector.extract [[ARG]][0] : vector<2xi32> +// CHECK-NEXT: [[HI:%.+]] = vector.extract [[ARG]][1] : vector<2xi32> +// CHECK-NEXT: [[CST0:%.+]] = arith.constant 0 : i32 +// CHECK-NEXT: [[HIEQ0:%.+]] = arith.cmpi eq, [[HI]], [[CST0]] : i32 +// CHECK-NEXT: [[LOWFP:%.+]] = arith.uitofp [[LOW]] : i32 to f16 +// CHECK-NEXT: [[HIFP:%.+]] = arith.uitofp [[HI]] : i32 to f16 +// CHECK-NEXT: [[POW:%.+]] = arith.constant 0x7C00 : f16 +// CHECK-NEXT: [[RESHI:%.+]] = arith.mulf [[HIFP]], [[POW]] : f16 +// CHECK-NEXT: [[RES:%.+]] = arith.addf [[LOWFP]], [[RESHI]] : f16 +// CHECK-NEXT: [[SEL:%.+]] = arith.select [[HIEQ0]], [[LOWFP]], [[RES]] : f16 +// CHECK-NEXT: return [[SEL]] : f16 +func.func @uitofp_i64_f16(%a : i64) -> f16 { + %r = arith.uitofp %a : i64 to f16 + return %r : f16 +} diff --git a/mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-uitofp-i32.mlir b/mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-uitofp-i32.mlir new file mode 100644 index 0000000000000..c3d7db0de6d20 --- /dev/null +++ b/mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-uitofp-i32.mlir @@ -0,0 +1,77 @@ +// Check that the wide integer `arith.uitofp` emulation produces the same result as wide +// `arith.uitofp`. Emulate i32 ops with i16 ops. + +// RUN: mlir-opt %s --convert-scf-to-cf --convert-cf-to-llvm --convert-vector-to-llvm \ +// RUN: --convert-func-to-llvm --convert-arith-to-llvm | \ +// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ +// RUN: --shared-libs=%mlir_c_runner_utils | \ +// RUN: FileCheck %s --match-full-lines + +// RUN: mlir-opt %s --test-arith-emulate-wide-int="widest-int-supported=16" \ +// RUN: --convert-scf-to-cf --convert-cf-to-llvm --convert-vector-to-llvm \ +// RUN: --convert-func-to-llvm --convert-arith-to-llvm | \ +// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ +// RUN: --shared-libs=%mlir_c_runner_utils | \ +// RUN: FileCheck %s --match-full-lines + +// Ops in this function *only* will be emulated using i16 types. 
+func.func @emulate_uitofp(%arg: i32) -> f32 { + %res = arith.uitofp %arg : i32 to f32 + return %res : f32 +} + +func.func @check_uitofp(%arg : i32) -> () { + %res = func.call @emulate_uitofp(%arg) : (i32) -> (f32) + vector.print %res : f32 + return +} + +func.func @entry() { + %cst0 = arith.constant 0 : i32 + %cst1 = arith.constant 1 : i32 + %cst2 = arith.constant 2 : i32 + %cst7 = arith.constant 7 : i32 + %cst1337 = arith.constant 1337 : i32 + %cst_i16_max = arith.constant 65535 : i32 + %cst_i16_overflow = arith.constant 65536 : i32 + + %cst_n1 = arith.constant -1 : i32 + %cst_n13 = arith.constant -13 : i32 + %cst_n1337 = arith.constant -1337 : i32 + + %cst_i16_min = arith.constant -32768 : i32 + + %cst_f32_int_max = arith.constant 16777217 : i32 + %cst_f32_int_min = arith.constant -16777217 : i32 + + // CHECK: 0 + func.call @check_uitofp(%cst0) : (i32) -> () + // CHECK-NEXT: 1 + func.call @check_uitofp(%cst1) : (i32) -> () + // CHECK-NEXT: 2 + func.call @check_uitofp(%cst2) : (i32) -> () + // CHECK-NEXT: 7 + func.call @check_uitofp(%cst7) : (i32) -> () + // CHECK-NEXT: 1337 + func.call @check_uitofp(%cst1337) : (i32) -> () + // CHECK-NEXT: 65535 + func.call @check_uitofp(%cst_i16_max) : (i32) -> () + // CHECK-NEXT: 65536 + func.call @check_uitofp(%cst_i16_overflow) : (i32) -> () + + // CHECK-NEXT: 4.2{{.+}}e+09 + func.call @check_uitofp(%cst_n1) : (i32) -> () + // CHECK-NEXT: 4.2{{.+}}e+09 + func.call @check_uitofp(%cst_n1337) : (i32) -> () + + // CHECK-NEXT: 4.2{{.+}}e+09 + func.call @check_uitofp(%cst_i16_min) : (i32) -> () + // CHECK-NEXT: 4.2{{.+}}e+09 + func.call @check_uitofp(%cst_i16_min) : (i32) -> () + // CHECK-NEXT: 1.6{{.+}}e+07 + func.call @check_uitofp(%cst_f32_int_max) : (i32) -> () + // CHECK-NEXT: 4.2{{.+}}e+09 + func.call @check_uitofp(%cst_f32_int_min) : (i32) -> () + + return +} From abfc358cff0c0cfc8ffbc6c164d97e13a18a1685 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Wed, 22 Mar 2023 19:09:48 -0400 Subject: [PATCH 050/208] [mlir][arith] Add `sitofp` support to WIE This depends on the handling of `uitofp` in D146606. 
Reviewed By: antiagainst Differential Revision: https://reviews.llvm.org/D146597 --- .../Arith/Transforms/EmulateWideInt.cpp | 49 ++++++++++++- mlir/test/Dialect/Arith/emulate-wide-int.mlir | 43 ++++++++++++ .../test-wide-int-emulation-sitofp-i32.mlir | 68 +++++++++++++++++++ 3 files changed, 159 insertions(+), 1 deletion(-) create mode 100644 mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-sitofp-i32.mlir diff --git a/mlir/lib/Dialect/Arith/Transforms/EmulateWideInt.cpp b/mlir/lib/Dialect/Arith/Transforms/EmulateWideInt.cpp index 83f01397c4490..781ea3d3eca63 100644 --- a/mlir/lib/Dialect/Arith/Transforms/EmulateWideInt.cpp +++ b/mlir/lib/Dialect/Arith/Transforms/EmulateWideInt.cpp @@ -16,6 +16,7 @@ #include "mlir/IR/TypeUtilities.h" #include "mlir/Support/LogicalResult.h" #include "mlir/Transforms/DialectConversion.h" +#include "llvm/ADT/APInt.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/MathExtras.h" #include @@ -907,6 +908,52 @@ struct ConvertShRSI final : OpConversionPattern { } }; +//===----------------------------------------------------------------------===// +// ConvertSIToFP +//===----------------------------------------------------------------------===// + +struct ConvertSIToFP final : OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(arith::SIToFPOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Location loc = op.getLoc(); + + Value in = op.getIn(); + Type oldTy = in.getType(); + auto newTy = + dyn_cast_or_null(getTypeConverter()->convertType(oldTy)); + if (!newTy) + return rewriter.notifyMatchFailure( + loc, llvm::formatv("unsupported type: {0}", oldTy)); + + unsigned oldBitWidth = getElementTypeOrSelf(oldTy).getIntOrFloatBitWidth(); + Value zeroCst = createScalarOrSplatConstant(rewriter, loc, oldTy, 0); + Value oneCst = createScalarOrSplatConstant(rewriter, loc, oldTy, 1); + Value allOnesCst = createScalarOrSplatConstant( + rewriter, loc, oldTy, APInt::getAllOnes(oldBitWidth)); + + // To avoid operating on very large unsigned numbers, perform the + // conversion on the absolute value. Then, decide whether to negate the + // result or not based on that sign bit. We assume two's complement and + // implement negation by flipping all bits and adding 1. + // Note that this relies on the the other conversion patterns to legalize + // created ops and narrow the bit widths. 
+ Value isNeg = rewriter.create(loc, arith::CmpIPredicate::slt, + in, zeroCst); + Value bitwiseNeg = rewriter.create(loc, in, allOnesCst); + Value neg = rewriter.create(loc, bitwiseNeg, oneCst); + Value abs = rewriter.create(loc, isNeg, neg, in); + + Value absResult = rewriter.create(loc, op.getType(), abs); + Value negResult = rewriter.create(loc, absResult); + rewriter.replaceOpWithNewOp(op, isNeg, negResult, + absResult); + return success(); + } +}; + //===----------------------------------------------------------------------===// // ConvertUIToFP //===----------------------------------------------------------------------===// @@ -1146,5 +1193,5 @@ void arith::populateArithWideIntEmulationPatterns( ConvertIndexCastIntToIndex, ConvertIndexCastIndexToInt, ConvertIndexCastIndexToInt, - ConvertUIToFP>(typeConverter, patterns.getContext()); + ConvertSIToFP, ConvertUIToFP>(typeConverter, patterns.getContext()); } diff --git a/mlir/test/Dialect/Arith/emulate-wide-int.mlir b/mlir/test/Dialect/Arith/emulate-wide-int.mlir index 55b4e7f89b0ac..9fb5478d7e94f 100644 --- a/mlir/test/Dialect/Arith/emulate-wide-int.mlir +++ b/mlir/test/Dialect/Arith/emulate-wide-int.mlir @@ -964,3 +964,46 @@ func.func @uitofp_i64_f16(%a : i64) -> f16 { %r = arith.uitofp %a : i64 to f16 return %r : f16 } + +// CHECK-LABEL: func @sitofp_i64_f64 +// CHECK-SAME: ([[ARG:%.+]]: vector<2xi32>) -> f64 +// CHECK: [[VONES:%.+]] = arith.constant dense<-1> : vector<2xi32> +// CHECK: [[ONES1:%.+]] = vector.extract [[VONES]][0] : vector<2xi32> +// CHECK-NEXT: [[ONES2:%.+]] = vector.extract [[VONES]][1] : vector<2xi32> +// CHECK: arith.xori {{%.+}}, [[ONES1]] : i32 +// CHECK-NEXT: arith.xori {{%.+}}, [[ONES2]] : i32 +// CHECK: [[CST0:%.+]] = arith.constant 0 : i32 +// CHECK: [[HIEQ0:%.+]] = arith.cmpi eq, [[HI:%.+]], [[CST0]] : i32 +// CHECK-NEXT: [[LOWFP:%.+]] = arith.uitofp [[LOW:%.+]] : i32 to f64 +// CHECK-NEXT: [[HIFP:%.+]] = arith.uitofp [[HI]] : i32 to f64 +// CHECK-NEXT: [[POW:%.+]] = arith.constant 0x41F0000000000000 : f64 +// CHECK-NEXT: [[RESHI:%.+]] = arith.mulf [[HIFP]], [[POW]] : f64 +// CHECK-NEXT: [[RES:%.+]] = arith.addf [[LOWFP]], [[RESHI]] : f64 +// CHECK-NEXT: [[SEL:%.+]] = arith.select [[HIEQ0]], [[LOWFP]], [[RES]] : f64 +// CHECK-NEXT: [[NEG:%.+]] = arith.negf [[SEL]] : f64 +// CHECK-NEXT: [[FINAL:%.+]] = arith.select %{{.+}}, [[NEG]], [[SEL]] : f64 +// CHECK-NEXT: return [[FINAL]] : f64 +func.func @sitofp_i64_f64(%a : i64) -> f64 { + %r = arith.sitofp %a : i64 to f64 + return %r : f64 +} + +// CHECK-LABEL: func @sitofp_i64_f64_vector +// CHECK-SAME: ([[ARG:%.+]]: vector<3x2xi32>) -> vector<3xf64> +// CHECK: [[VONES:%.+]] = arith.constant dense<-1> : vector<3x2xi32> +// CHECK: arith.xori +// CHECK-NEXT: arith.xori +// CHECK: [[HIEQ0:%.+]] = arith.cmpi eq, [[HI:%.+]], [[CST0:%.+]] : vector<3xi32> +// CHECK-NEXT: [[LOWFP:%.+]] = arith.uitofp [[LOW:%.+]] : vector<3xi32> to vector<3xf64> +// CHECK-NEXT: [[HIFP:%.+]] = arith.uitofp [[HI:%.+]] : vector<3xi32> to vector<3xf64> +// CHECK-NEXT: [[POW:%.+]] = arith.constant dense<0x41F0000000000000> : vector<3xf64> +// CHECK-NEXT: [[RESHI:%.+]] = arith.mulf [[HIFP]], [[POW]] : vector<3xf64> +// CHECK-NEXT: [[RES:%.+]] = arith.addf [[LOWFP]], [[RESHI]] : vector<3xf64> +// CHECK-NEXT: [[SEL:%.+]] = arith.select [[HIEQ0]], [[LOWFP]], [[RES]] : vector<3xi1>, vector<3xf64> +// CHECK-NEXT: [[NEG:%.+]] = arith.negf [[SEL]] : vector<3xf64> +// CHECK-NEXT: [[FINAL:%.+]] = arith.select %{{.+}}, [[NEG]], [[SEL]] : vector<3xi1>, vector<3xf64> +// CHECK-NEXT: return [[FINAL]] : 
vector<3xf64> +func.func @sitofp_i64_f64_vector(%a : vector<3xi64>) -> vector<3xf64> { + %r = arith.sitofp %a : vector<3xi64> to vector<3xf64> + return %r : vector<3xf64> +} diff --git a/mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-sitofp-i32.mlir b/mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-sitofp-i32.mlir new file mode 100644 index 0000000000000..3fc008705f111 --- /dev/null +++ b/mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-sitofp-i32.mlir @@ -0,0 +1,68 @@ +// Check that the wide integer `arith.sitofp` emulation produces the same result as wide +// `arith.sitofp`. Emulate i32 ops with i16 ops. + +// RUN: mlir-opt %s --convert-scf-to-cf --convert-cf-to-llvm --convert-vector-to-llvm \ +// RUN: --convert-func-to-llvm --convert-arith-to-llvm | \ +// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ +// RUN: --shared-libs=%mlir_c_runner_utils | \ +// RUN: FileCheck %s --match-full-lines + +// RUN: mlir-opt %s --test-arith-emulate-wide-int="widest-int-supported=16" \ +// RUN: --convert-scf-to-cf --convert-cf-to-llvm --convert-vector-to-llvm \ +// RUN: --convert-func-to-llvm --convert-arith-to-llvm | \ +// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ +// RUN: --shared-libs=%mlir_c_runner_utils | \ +// RUN: FileCheck %s --match-full-lines + +// Ops in this function *only* will be emulated using i16 types. +func.func @emulate_sitofp(%arg: i32) -> f32 { + %res = arith.sitofp %arg : i32 to f32 + return %res : f32 +} + +func.func @check_sitofp(%arg : i32) -> () { + %res = func.call @emulate_sitofp(%arg) : (i32) -> (f32) + vector.print %res : f32 + return +} + +func.func @entry() { + %cst0 = arith.constant 0 : i32 + %cst1 = arith.constant 1 : i32 + %cst2 = arith.constant 2 : i32 + %cst7 = arith.constant 7 : i32 + %cst1337 = arith.constant 1337 : i32 + + %cst_n1 = arith.constant -1 : i32 + %cst_n13 = arith.constant -13 : i32 + %cst_n1337 = arith.constant -1337 : i32 + + %cst_i16_min = arith.constant -32768 : i32 + + %cst_f32_int_max = arith.constant 16777217 : i32 + %cst_f32_int_min = arith.constant -16777217 : i32 + + // CHECK: 0 + func.call @check_sitofp(%cst0) : (i32) -> () + // CHECK-NEXT: 1 + func.call @check_sitofp(%cst1) : (i32) -> () + // CHECK-NEXT: 2 + func.call @check_sitofp(%cst2) : (i32) -> () + // CHECK-NEXT: 7 + func.call @check_sitofp(%cst7) : (i32) -> () + // CHECK-NEXT: 1337 + func.call @check_sitofp(%cst1337) : (i32) -> () + // CHECK-NEXT: -1 + func.call @check_sitofp(%cst_n1) : (i32) -> () + // CHECK-NEXT: -1337 + func.call @check_sitofp(%cst_n1337) : (i32) -> () + + // CHECK-NEXT: -32768 + func.call @check_sitofp(%cst_i16_min) : (i32) -> () + // CHECK-NEXT: 1.6{{.+}}e+07 + func.call @check_sitofp(%cst_f32_int_max) : (i32) -> () + // CHECK-NEXT: -1.6{{.+}}e+07 + func.call @check_sitofp(%cst_f32_int_min) : (i32) -> () + + return +} From c81f14e5898c37c13e5b22485cf37cc124caf0c1 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Wed, 22 Mar 2023 19:15:12 -0400 Subject: [PATCH 051/208] [mlir][arith] Fix typos in WIE. NFC. 
--- .../Dialect/Arith/Transforms/WideIntEmulationConverter.h | 2 +- mlir/lib/Dialect/Arith/Transforms/EmulateWideInt.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mlir/include/mlir/Dialect/Arith/Transforms/WideIntEmulationConverter.h b/mlir/include/mlir/Dialect/Arith/Transforms/WideIntEmulationConverter.h index ea0ab14e9b8f1..5dbbfedcc70ee 100644 --- a/mlir/include/mlir/Dialect/Arith/Transforms/WideIntEmulationConverter.h +++ b/mlir/include/mlir/Dialect/Arith/Transforms/WideIntEmulationConverter.h @@ -16,7 +16,7 @@ namespace mlir::arith { /// two halves and thus turning into supported ones, i.e., i2*N --> iN, where N /// is the widest integer bitwidth supported by the target. /// Currently, we only handle power-of-two integer types and support conversions -/// of integers twice as wide as the maxium supported by the target. Wide +/// of integers twice as wide as the maximum supported by the target. Wide /// integers are represented as vectors, e.g., i64 --> vector<2xi32>, where the /// first element is the low half of the original integer, and the second /// element the high half. diff --git a/mlir/lib/Dialect/Arith/Transforms/EmulateWideInt.cpp b/mlir/lib/Dialect/Arith/Transforms/EmulateWideInt.cpp index 781ea3d3eca63..96a58459a37b9 100644 --- a/mlir/lib/Dialect/Arith/Transforms/EmulateWideInt.cpp +++ b/mlir/lib/Dialect/Arith/Transforms/EmulateWideInt.cpp @@ -43,7 +43,7 @@ static std::pair getHalves(const APInt &value, return {std::move(low), std::move(high)}; } -/// Returns the type with the last (innermost) dimention reduced to x1. +/// Returns the type with the last (innermost) dimension reduced to x1. /// Scalarizes 1D vector inputs to match how we extract/insert vector values, /// e.g.: /// - vector<3x2xi16> --> vector<3x1xi16> @@ -128,7 +128,7 @@ static Value dropTrailingX1Dim(ConversionPatternRewriter &rewriter, if (!vecTy) return input; - // Shape cast to drop the last x1 dimention. + // Shape cast to drop the last x1 dimension. ArrayRef shape = vecTy.getShape(); assert(shape.size() >= 2 && "Expected vector with at list two dims"); assert(shape.back() == 1 && "Expected the last vector dim to be x1"); @@ -177,13 +177,13 @@ static Value insertLastDimSlice(ConversionPatternRewriter &rewriter, /// dimension. /// When all `resultComponents` are scalars, the result type is `vector`; /// when `resultComponents` are `vector<...x1xT>`s, the result type is -/// `vector<...xNxT>`, where `N` is the number of `resultComponenets`. +/// `vector<...xNxT>`, where `N` is the number of `resultComponents`. static Value constructResultVector(ConversionPatternRewriter &rewriter, Location loc, VectorType resultType, ValueRange resultComponents) { llvm::ArrayRef resultShape = resultType.getShape(); (void)resultShape; - assert(!resultShape.empty() && "Result expected to have dimentions"); + assert(!resultShape.empty() && "Result expected to have dimensions"); assert(resultShape.back() == static_cast(resultComponents.size()) && "Wrong number of result components"); From 0c0387c7a5e979d2dbf791404c7398856895f8fb Mon Sep 17 00:00:00 2001 From: Pavel Kopyl Date: Thu, 16 Mar 2023 21:53:14 +0100 Subject: [PATCH 052/208] [NVPTX] Port GenericToNVVM to the new PM. 
Differential Revision: https://reviews.llvm.org/D146345 --- llvm/lib/Target/NVPTX/NVPTX.h | 6 ++- llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp | 52 ++++++++++++------- llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 16 ++++-- llvm/test/CodeGen/NVPTX/generic-to-nvvm-ir.ll | 1 + 4 files changed, 53 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 95184420f6087..521a7843b1142 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -38,7 +38,7 @@ enum CondCodes { FunctionPass *createNVPTXISelDag(NVPTXTargetMachine &TM, llvm::CodeGenOpt::Level OptLevel); ModulePass *createNVPTXAssignValidGlobalNamesPass(); -ModulePass *createGenericToNVVMPass(); +ModulePass *createGenericToNVVMLegacyPass(); FunctionPass *createNVVMIntrRangePass(unsigned int SmVersion); FunctionPass *createNVVMReflectPass(unsigned int SmVersion); MachineFunctionPass *createNVPTXPrologEpilogPass(); @@ -67,6 +67,10 @@ struct NVVMReflectPass : PassInfoMixin { unsigned SmVersion; }; +struct GenericToNVVMPass : PassInfoMixin { + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + namespace NVPTX { enum DrvInterface { NVCL, diff --git a/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp index d892023c6cb7f..4f03e474edb47 100644 --- a/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp @@ -29,19 +29,13 @@ using namespace llvm; namespace llvm { -void initializeGenericToNVVMPass(PassRegistry &); +void initializeGenericToNVVMLegacyPassPass(PassRegistry &); } namespace { -class GenericToNVVM : public ModulePass { +class GenericToNVVM { public: - static char ID; - - GenericToNVVM() : ModulePass(ID) {} - - bool runOnModule(Module &M) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override {} + bool runOnModule(Module &M); private: Value *remapConstant(Module *M, Function *F, Constant *C, @@ -59,15 +53,6 @@ class GenericToNVVM : public ModulePass { }; } // end namespace -char GenericToNVVM::ID = 0; - -ModulePass *llvm::createGenericToNVVMPass() { return new GenericToNVVM(); } - -INITIALIZE_PASS( - GenericToNVVM, "generic-to-nvvm", - "Ensure that the global variables are in the global address space", false, - false) - bool GenericToNVVM::runOnModule(Module &M) { // Create a clone of each global variable that has the default address space. // The clone is created with the global address space specifier, and the pair @@ -293,3 +278,34 @@ Value *GenericToNVVM::remapConstantExpr(Module *M, Function *F, ConstantExpr *C, llvm_unreachable("GenericToNVVM encountered an unsupported ConstantExpr"); } } + +namespace { +class GenericToNVVMLegacyPass : public ModulePass { +public: + static char ID; + + GenericToNVVMLegacyPass() : ModulePass(ID) {} + + bool runOnModule(Module &M) override; +}; +} // namespace + +char GenericToNVVMLegacyPass::ID = 0; + +ModulePass *llvm::createGenericToNVVMLegacyPass() { + return new GenericToNVVMLegacyPass(); +} + +INITIALIZE_PASS( + GenericToNVVMLegacyPass, "generic-to-nvvm", + "Ensure that the global variables are in the global address space", false, + false) + +bool GenericToNVVMLegacyPass::runOnModule(Module &M) { + return GenericToNVVM().runOnModule(M); +} + +PreservedAnalyses GenericToNVVMPass::run(Module &M, ModuleAnalysisManager &AM) { + return GenericToNVVM().runOnModule(M) ? 
PreservedAnalyses::none() + : PreservedAnalyses::all(); +} diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 5ff9b4df6d7f9..3d1e4fcde90aa 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -64,7 +64,7 @@ static cl::opt UseShortPointersOpt( namespace llvm { -void initializeGenericToNVVMPass(PassRegistry&); +void initializeGenericToNVVMLegacyPassPass(PassRegistry &); void initializeNVPTXAllocaHoistingPass(PassRegistry &); void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&); void initializeNVPTXAtomicLowerPass(PassRegistry &); @@ -89,7 +89,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() { // but it's very NVPTX-specific. initializeNVVMReflectPass(PR); initializeNVVMIntrRangePass(PR); - initializeGenericToNVVMPass(PR); + initializeGenericToNVVMLegacyPassPass(PR); initializeNVPTXAllocaHoistingPass(PR); initializeNVPTXAssignValidGlobalNamesPass(PR); initializeNVPTXAtomicLowerPass(PR); @@ -246,6 +246,16 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { return false; }); + PB.registerPipelineParsingCallback( + [](StringRef PassName, ModulePassManager &PM, + ArrayRef) { + if (PassName == "generic-to-nvvm") { + PM.addPass(GenericToNVVMPass()); + return true; + } + return false; + }); + PB.registerPipelineStartEPCallback( [this](ModulePassManager &PM, OptimizationLevel Level) { FunctionPassManager FPM; @@ -348,7 +358,7 @@ void NVPTXPassConfig::addIRPasses() { if (getOptLevel() != CodeGenOpt::None) addPass(createNVPTXImageOptimizerPass()); addPass(createNVPTXAssignValidGlobalNamesPass()); - addPass(createGenericToNVVMPass()); + addPass(createGenericToNVVMLegacyPass()); // NVPTXLowerArgs is required for correctness and should be run right // before the address space inference passes. diff --git a/llvm/test/CodeGen/NVPTX/generic-to-nvvm-ir.ll b/llvm/test/CodeGen/NVPTX/generic-to-nvvm-ir.ll index 51344b474d29e..daed7c1c98f0b 100644 --- a/llvm/test/CodeGen/NVPTX/generic-to-nvvm-ir.ll +++ b/llvm/test/CodeGen/NVPTX/generic-to-nvvm-ir.ll @@ -1,6 +1,7 @@ ; Verify functionality of NVPTXGenericToNVVM.cpp pass. ; ; RUN: opt < %s -march nvptx64 -S -generic-to-nvvm | FileCheck %s +; RUN: opt < %s -march nvptx64 -S -passes='generic-to-nvvm' | FileCheck %s target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" From fd47ab05e5abd98254d2bba012d81dbb00217812 Mon Sep 17 00:00:00 2001 From: Amy Huang Date: Wed, 22 Mar 2023 15:55:18 -0700 Subject: [PATCH 053/208] Add "REQUIRES: asserts" to test that uses --debug-only flag --- bolt/test/X86/section-end-sym.s | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bolt/test/X86/section-end-sym.s b/bolt/test/X86/section-end-sym.s index a9bca5604ec16..38517bf7e0719 100644 --- a/bolt/test/X86/section-end-sym.s +++ b/bolt/test/X86/section-end-sym.s @@ -1,7 +1,7 @@ ## Check that BOLT doesn't consider end-of-section symbols (e.g., _etext) as ## functions. 
-# REQUIRES: system-linux +# REQUIRES: system-linux, asserts # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-linux %s -o %t.o # RUN: ld.lld %t.o -o %t.exe -q From d859275e7701c10b7dfe8b2be27b8eae4d97a7bd Mon Sep 17 00:00:00 2001 From: Kai Sasaki Date: Thu, 23 Mar 2023 09:50:40 +0900 Subject: [PATCH 054/208] [mlir] Fix typo for unknown operation Reviewed By: rriddle Differential Revision: https://reviews.llvm.org/D146607 --- mlir/lib/IR/AsmPrinter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/IR/AsmPrinter.cpp b/mlir/lib/IR/AsmPrinter.cpp index dd3112516fc51..75448955f3123 100644 --- a/mlir/lib/IR/AsmPrinter.cpp +++ b/mlir/lib/IR/AsmPrinter.cpp @@ -1369,7 +1369,7 @@ void SSANameState::printValueID(Value value, bool printResultNo, void SSANameState::printOperationID(Operation *op, raw_ostream &stream) const { auto it = operationIDs.find(op); if (it == operationIDs.end()) { - stream << "<>"; + stream << "<>"; } else { stream << '%' << it->second; } From 9855fe4568770947abf6c465c513dfd4a6c6dca6 Mon Sep 17 00:00:00 2001 From: Ben Shi Date: Tue, 14 Mar 2023 17:27:47 +0800 Subject: [PATCH 055/208] [RISCV][NFC] Add more tests for SLP vectorization (binops on load/store) Reviewed By: reames Differential Revision: https://reviews.llvm.org/D146025 --- .../SLPVectorizer/RISCV/load-binop-store.ll | 386 ++++++++++++++++++ 1 file changed, 386 insertions(+) create mode 100644 llvm/test/Transforms/SLPVectorizer/RISCV/load-binop-store.ll diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/load-binop-store.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/load-binop-store.ll new file mode 100644 index 0000000000000..92b0f83c84b9e --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/load-binop-store.ll @@ -0,0 +1,386 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 -mattr=+v \ +; RUN: -riscv-v-vector-bits-min=-1 -riscv-v-slp-max-vf=0 -S | FileCheck %s --check-prefixes=CHECK +; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 -mattr=+v -S | FileCheck %s --check-prefixes=DEFAULT + +define void @vec_add(ptr %dest, ptr %p) { +; CHECK-LABEL: @vec_add( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i16> [[TMP0]], +; CHECK-NEXT: store <2 x i16> [[TMP1]], ptr [[DEST:%.*]], align 4 +; CHECK-NEXT: ret void +; +; DEFAULT-LABEL: @vec_add( +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[E0:%.*]] = load i16, ptr [[P:%.*]], align 4 +; DEFAULT-NEXT: [[INC:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1 +; DEFAULT-NEXT: [[E1:%.*]] = load i16, ptr [[INC]], align 2 +; DEFAULT-NEXT: [[A0:%.*]] = add i16 [[E0]], 1 +; DEFAULT-NEXT: [[A1:%.*]] = add i16 [[E1]], 1 +; DEFAULT-NEXT: store i16 [[A0]], ptr [[DEST:%.*]], align 4 +; DEFAULT-NEXT: [[INC2:%.*]] = getelementptr inbounds i16, ptr [[DEST]], i64 1 +; DEFAULT-NEXT: store i16 [[A1]], ptr [[INC2]], align 2 +; DEFAULT-NEXT: ret void +; +entry: + %e0 = load i16, ptr %p, align 4 + %inc = getelementptr inbounds i16, ptr %p, i64 1 + %e1 = load i16, ptr %inc, align 2 + + %a0 = add i16 %e0, 1 + %a1 = add i16 %e1, 1 + + store i16 %a0, ptr %dest, align 4 + %inc2 = getelementptr inbounds i16, ptr %dest, i64 1 + store i16 %a1, ptr %inc2, align 2 + ret void +} + +define void @vec_sub(ptr %dest, ptr %p) { +; CHECK-LABEL: @vec_sub( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = sub <2 x i16> [[TMP0]], +; 
CHECK-NEXT: store <2 x i16> [[TMP1]], ptr [[DEST:%.*]], align 4 +; CHECK-NEXT: ret void +; +; DEFAULT-LABEL: @vec_sub( +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[E0:%.*]] = load i16, ptr [[P:%.*]], align 4 +; DEFAULT-NEXT: [[INC:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1 +; DEFAULT-NEXT: [[E1:%.*]] = load i16, ptr [[INC]], align 2 +; DEFAULT-NEXT: [[A0:%.*]] = sub i16 [[E0]], 17 +; DEFAULT-NEXT: [[A1:%.*]] = sub i16 [[E1]], 17 +; DEFAULT-NEXT: store i16 [[A0]], ptr [[DEST:%.*]], align 4 +; DEFAULT-NEXT: [[INC2:%.*]] = getelementptr inbounds i16, ptr [[DEST]], i64 1 +; DEFAULT-NEXT: store i16 [[A1]], ptr [[INC2]], align 2 +; DEFAULT-NEXT: ret void +; +entry: + %e0 = load i16, ptr %p, align 4 + %inc = getelementptr inbounds i16, ptr %p, i64 1 + %e1 = load i16, ptr %inc, align 2 + + %a0 = sub i16 %e0, 17 + %a1 = sub i16 %e1, 17 + + store i16 %a0, ptr %dest, align 4 + %inc2 = getelementptr inbounds i16, ptr %dest, i64 1 + store i16 %a1, ptr %inc2, align 2 + ret void +} + +define void @vec_rsub(ptr %dest, ptr %p) { +; CHECK-LABEL: @vec_rsub( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = sub <2 x i16> , [[TMP0]] +; CHECK-NEXT: store <2 x i16> [[TMP1]], ptr [[DEST:%.*]], align 4 +; CHECK-NEXT: ret void +; +; DEFAULT-LABEL: @vec_rsub( +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[E0:%.*]] = load i16, ptr [[P:%.*]], align 4 +; DEFAULT-NEXT: [[INC:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1 +; DEFAULT-NEXT: [[E1:%.*]] = load i16, ptr [[INC]], align 2 +; DEFAULT-NEXT: [[A0:%.*]] = sub i16 29, [[E0]] +; DEFAULT-NEXT: [[A1:%.*]] = sub i16 29, [[E1]] +; DEFAULT-NEXT: store i16 [[A0]], ptr [[DEST:%.*]], align 4 +; DEFAULT-NEXT: [[INC2:%.*]] = getelementptr inbounds i16, ptr [[DEST]], i64 1 +; DEFAULT-NEXT: store i16 [[A1]], ptr [[INC2]], align 2 +; DEFAULT-NEXT: ret void +; +entry: + %e0 = load i16, ptr %p, align 4 + %inc = getelementptr inbounds i16, ptr %p, i64 1 + %e1 = load i16, ptr %inc, align 2 + + %a0 = sub i16 29, %e0 + %a1 = sub i16 29, %e1 + + store i16 %a0, ptr %dest, align 4 + %inc2 = getelementptr inbounds i16, ptr %dest, i64 1 + store i16 %a1, ptr %inc2, align 2 + ret void +} + +define void @vec_mul(ptr %dest, ptr %p) { +; CHECK-LABEL: @vec_mul( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = mul <2 x i16> [[TMP0]], +; CHECK-NEXT: store <2 x i16> [[TMP1]], ptr [[DEST:%.*]], align 4 +; CHECK-NEXT: ret void +; +; DEFAULT-LABEL: @vec_mul( +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[E0:%.*]] = load i16, ptr [[P:%.*]], align 4 +; DEFAULT-NEXT: [[INC:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1 +; DEFAULT-NEXT: [[E1:%.*]] = load i16, ptr [[INC]], align 2 +; DEFAULT-NEXT: [[A0:%.*]] = mul i16 [[E0]], 7 +; DEFAULT-NEXT: [[A1:%.*]] = mul i16 [[E1]], 7 +; DEFAULT-NEXT: store i16 [[A0]], ptr [[DEST:%.*]], align 4 +; DEFAULT-NEXT: [[INC2:%.*]] = getelementptr inbounds i16, ptr [[DEST]], i64 1 +; DEFAULT-NEXT: store i16 [[A1]], ptr [[INC2]], align 2 +; DEFAULT-NEXT: ret void +; +entry: + %e0 = load i16, ptr %p, align 4 + %inc = getelementptr inbounds i16, ptr %p, i64 1 + %e1 = load i16, ptr %inc, align 2 + + %a0 = mul i16 %e0, 7 + %a1 = mul i16 %e1, 7 + + store i16 %a0, ptr %dest, align 4 + %inc2 = getelementptr inbounds i16, ptr %dest, i64 1 + store i16 %a1, ptr %inc2, align 2 + ret void +} + +define void @vec_sdiv(ptr %dest, ptr %p) { +; CHECK-LABEL: @vec_sdiv( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr 
[[P:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = sdiv <2 x i16> [[TMP0]], +; CHECK-NEXT: store <2 x i16> [[TMP1]], ptr [[DEST:%.*]], align 4 +; CHECK-NEXT: ret void +; +; DEFAULT-LABEL: @vec_sdiv( +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[E0:%.*]] = load i16, ptr [[P:%.*]], align 4 +; DEFAULT-NEXT: [[INC:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1 +; DEFAULT-NEXT: [[E1:%.*]] = load i16, ptr [[INC]], align 2 +; DEFAULT-NEXT: [[A0:%.*]] = sdiv i16 [[E0]], 7 +; DEFAULT-NEXT: [[A1:%.*]] = sdiv i16 [[E1]], 7 +; DEFAULT-NEXT: store i16 [[A0]], ptr [[DEST:%.*]], align 4 +; DEFAULT-NEXT: [[INC2:%.*]] = getelementptr inbounds i16, ptr [[DEST]], i64 1 +; DEFAULT-NEXT: store i16 [[A1]], ptr [[INC2]], align 2 +; DEFAULT-NEXT: ret void +; +entry: + %e0 = load i16, ptr %p, align 4 + %inc = getelementptr inbounds i16, ptr %p, i64 1 + %e1 = load i16, ptr %inc, align 2 + + %a0 = sdiv i16 %e0, 7 + %a1 = sdiv i16 %e1, 7 + + store i16 %a0, ptr %dest, align 4 + %inc2 = getelementptr inbounds i16, ptr %dest, i64 1 + store i16 %a1, ptr %inc2, align 2 + ret void +} + +define void @vec_and(ptr %dest, ptr %p, ptr %q) { +; CHECK-LABEL: @vec_and( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[Q:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i16> [[TMP0]], [[TMP1]] +; CHECK-NEXT: store <2 x i16> [[TMP2]], ptr [[DEST:%.*]], align 4 +; CHECK-NEXT: ret void +; +; DEFAULT-LABEL: @vec_and( +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[E0:%.*]] = load i16, ptr [[P:%.*]], align 4 +; DEFAULT-NEXT: [[INC:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1 +; DEFAULT-NEXT: [[E1:%.*]] = load i16, ptr [[INC]], align 2 +; DEFAULT-NEXT: [[F0:%.*]] = load i16, ptr [[Q:%.*]], align 4 +; DEFAULT-NEXT: [[INQ:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 1 +; DEFAULT-NEXT: [[F1:%.*]] = load i16, ptr [[INQ]], align 2 +; DEFAULT-NEXT: [[A0:%.*]] = and i16 [[E0]], [[F0]] +; DEFAULT-NEXT: [[A1:%.*]] = and i16 [[E1]], [[F1]] +; DEFAULT-NEXT: store i16 [[A0]], ptr [[DEST:%.*]], align 4 +; DEFAULT-NEXT: [[INC2:%.*]] = getelementptr inbounds i16, ptr [[DEST]], i64 1 +; DEFAULT-NEXT: store i16 [[A1]], ptr [[INC2]], align 2 +; DEFAULT-NEXT: ret void +; +entry: + %e0 = load i16, ptr %p, align 4 + %inc = getelementptr inbounds i16, ptr %p, i64 1 + %e1 = load i16, ptr %inc, align 2 + + %f0 = load i16, ptr %q, align 4 + %inq = getelementptr inbounds i16, ptr %q, i64 1 + %f1 = load i16, ptr %inq, align 2 + + %a0 = and i16 %e0, %f0 + %a1 = and i16 %e1, %f1 + + store i16 %a0, ptr %dest, align 4 + %inc2 = getelementptr inbounds i16, ptr %dest, i64 1 + store i16 %a1, ptr %inc2, align 2 + ret void +} + +define void @vec_or(ptr %dest, ptr %p, ptr %q) { +; CHECK-LABEL: @vec_or( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[Q:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i16> [[TMP0]], [[TMP1]] +; CHECK-NEXT: store <2 x i16> [[TMP2]], ptr [[DEST:%.*]], align 4 +; CHECK-NEXT: ret void +; +; DEFAULT-LABEL: @vec_or( +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[E0:%.*]] = load i16, ptr [[P:%.*]], align 4 +; DEFAULT-NEXT: [[INC:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1 +; DEFAULT-NEXT: [[E1:%.*]] = load i16, ptr [[INC]], align 2 +; DEFAULT-NEXT: [[F0:%.*]] = load i16, ptr [[Q:%.*]], align 4 +; DEFAULT-NEXT: [[INQ:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 1 +; DEFAULT-NEXT: [[F1:%.*]] = load i16, ptr [[INQ]], align 2 +; 
DEFAULT-NEXT: [[A0:%.*]] = or i16 [[E0]], [[F0]] +; DEFAULT-NEXT: [[A1:%.*]] = or i16 [[E1]], [[F1]] +; DEFAULT-NEXT: store i16 [[A0]], ptr [[DEST:%.*]], align 4 +; DEFAULT-NEXT: [[INC2:%.*]] = getelementptr inbounds i16, ptr [[DEST]], i64 1 +; DEFAULT-NEXT: store i16 [[A1]], ptr [[INC2]], align 2 +; DEFAULT-NEXT: ret void +; +entry: + %e0 = load i16, ptr %p, align 4 + %inc = getelementptr inbounds i16, ptr %p, i64 1 + %e1 = load i16, ptr %inc, align 2 + + %f0 = load i16, ptr %q, align 4 + %inq = getelementptr inbounds i16, ptr %q, i64 1 + %f1 = load i16, ptr %inq, align 2 + + %a0 = or i16 %e0, %f0 + %a1 = or i16 %e1, %f1 + + store i16 %a0, ptr %dest, align 4 + %inc2 = getelementptr inbounds i16, ptr %dest, i64 1 + store i16 %a1, ptr %inc2, align 2 + ret void +} + +define void @vec_sll(ptr %dest, ptr %p, ptr %q) { +; CHECK-LABEL: @vec_sll( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[Q:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shl <2 x i16> [[TMP0]], [[TMP1]] +; CHECK-NEXT: store <2 x i16> [[TMP2]], ptr [[DEST:%.*]], align 4 +; CHECK-NEXT: ret void +; +; DEFAULT-LABEL: @vec_sll( +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[E0:%.*]] = load i16, ptr [[P:%.*]], align 4 +; DEFAULT-NEXT: [[INC:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1 +; DEFAULT-NEXT: [[E1:%.*]] = load i16, ptr [[INC]], align 2 +; DEFAULT-NEXT: [[F0:%.*]] = load i16, ptr [[Q:%.*]], align 4 +; DEFAULT-NEXT: [[INQ:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 1 +; DEFAULT-NEXT: [[F1:%.*]] = load i16, ptr [[INQ]], align 2 +; DEFAULT-NEXT: [[A0:%.*]] = shl i16 [[E0]], [[F0]] +; DEFAULT-NEXT: [[A1:%.*]] = shl i16 [[E1]], [[F1]] +; DEFAULT-NEXT: store i16 [[A0]], ptr [[DEST:%.*]], align 4 +; DEFAULT-NEXT: [[INC2:%.*]] = getelementptr inbounds i16, ptr [[DEST]], i64 1 +; DEFAULT-NEXT: store i16 [[A1]], ptr [[INC2]], align 2 +; DEFAULT-NEXT: ret void +; +entry: + %e0 = load i16, ptr %p, align 4 + %inc = getelementptr inbounds i16, ptr %p, i64 1 + %e1 = load i16, ptr %inc, align 2 + + %f0 = load i16, ptr %q, align 4 + %inq = getelementptr inbounds i16, ptr %q, i64 1 + %f1 = load i16, ptr %inq, align 2 + + %a0 = shl i16 %e0, %f0 + %a1 = shl i16 %e1, %f1 + + store i16 %a0, ptr %dest, align 4 + %inc2 = getelementptr inbounds i16, ptr %dest, i64 1 + store i16 %a1, ptr %inc2, align 2 + ret void +} + +declare i16 @llvm.smin.i16(i16, i16) +define void @vec_smin(ptr %dest, ptr %p, ptr %q) { +; CHECK-LABEL: @vec_smin( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[Q:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) +; CHECK-NEXT: store <2 x i16> [[TMP2]], ptr [[DEST:%.*]], align 4 +; CHECK-NEXT: ret void +; +; DEFAULT-LABEL: @vec_smin( +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[E0:%.*]] = load i16, ptr [[P:%.*]], align 4 +; DEFAULT-NEXT: [[INC:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1 +; DEFAULT-NEXT: [[E1:%.*]] = load i16, ptr [[INC]], align 2 +; DEFAULT-NEXT: [[F0:%.*]] = load i16, ptr [[Q:%.*]], align 4 +; DEFAULT-NEXT: [[INQ:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 1 +; DEFAULT-NEXT: [[F1:%.*]] = load i16, ptr [[INQ]], align 2 +; DEFAULT-NEXT: [[A0:%.*]] = tail call i16 @llvm.smin.i16(i16 [[E0]], i16 [[F0]]) +; DEFAULT-NEXT: [[A1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[E1]], i16 [[F1]]) +; DEFAULT-NEXT: store i16 [[A0]], ptr [[DEST:%.*]], align 
4 +; DEFAULT-NEXT: [[INC2:%.*]] = getelementptr inbounds i16, ptr [[DEST]], i64 1 +; DEFAULT-NEXT: store i16 [[A1]], ptr [[INC2]], align 2 +; DEFAULT-NEXT: ret void +; +entry: + %e0 = load i16, ptr %p, align 4 + %inc = getelementptr inbounds i16, ptr %p, i64 1 + %e1 = load i16, ptr %inc, align 2 + + %f0 = load i16, ptr %q, align 4 + %inq = getelementptr inbounds i16, ptr %q, i64 1 + %f1 = load i16, ptr %inq, align 2 + + %a0 = tail call i16 @llvm.smin.i16(i16 %e0, i16 %f0) + %a1 = tail call i16 @llvm.smin.i16(i16 %e1, i16 %f1) + + store i16 %a0, ptr %dest, align 4 + %inc2 = getelementptr inbounds i16, ptr %dest, i64 1 + store i16 %a1, ptr %inc2, align 2 + ret void +} + +declare i16 @llvm.umax.i16(i16, i16) +define void @vec_umax(ptr %dest, ptr %p, ptr %q) { +; CHECK-LABEL: @vec_umax( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[Q:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.umax.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) +; CHECK-NEXT: store <2 x i16> [[TMP2]], ptr [[DEST:%.*]], align 4 +; CHECK-NEXT: ret void +; +; DEFAULT-LABEL: @vec_umax( +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[E0:%.*]] = load i16, ptr [[P:%.*]], align 4 +; DEFAULT-NEXT: [[INC:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1 +; DEFAULT-NEXT: [[E1:%.*]] = load i16, ptr [[INC]], align 2 +; DEFAULT-NEXT: [[F0:%.*]] = load i16, ptr [[Q:%.*]], align 4 +; DEFAULT-NEXT: [[INQ:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 1 +; DEFAULT-NEXT: [[F1:%.*]] = load i16, ptr [[INQ]], align 2 +; DEFAULT-NEXT: [[A0:%.*]] = tail call i16 @llvm.umax.i16(i16 [[E0]], i16 [[F0]]) +; DEFAULT-NEXT: [[A1:%.*]] = tail call i16 @llvm.umax.i16(i16 [[E1]], i16 [[F1]]) +; DEFAULT-NEXT: store i16 [[A0]], ptr [[DEST:%.*]], align 4 +; DEFAULT-NEXT: [[INC2:%.*]] = getelementptr inbounds i16, ptr [[DEST]], i64 1 +; DEFAULT-NEXT: store i16 [[A1]], ptr [[INC2]], align 2 +; DEFAULT-NEXT: ret void +; +entry: + %e0 = load i16, ptr %p, align 4 + %inc = getelementptr inbounds i16, ptr %p, i64 1 + %e1 = load i16, ptr %inc, align 2 + + %f0 = load i16, ptr %q, align 4 + %inq = getelementptr inbounds i16, ptr %q, i64 1 + %f1 = load i16, ptr %inq, align 2 + + %a0 = tail call i16 @llvm.umax.i16(i16 %e0, i16 %f0) + %a1 = tail call i16 @llvm.umax.i16(i16 %e1, i16 %f1) + + store i16 %a0, ptr %dest, align 4 + %inc2 = getelementptr inbounds i16, ptr %dest, i64 1 + store i16 %a1, ptr %inc2, align 2 + ret void +} From ae63b1a5767b89fe5af140365f9e3ccf74feb1f0 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Wed, 22 Mar 2023 19:58:08 -0500 Subject: [PATCH 056/208] [libc] Adjust NVPTX startup code Summary: The startup code needs to include the environment pointer so we add this to the arguments. Also we need to ensure that the `crt1.o` file is made with `-fgpu-rdc` set so we can actually use it without undefined reference errors. --- libc/startup/gpu/nvptx/CMakeLists.txt | 7 ++++++- libc/startup/gpu/nvptx/start.cpp | 7 ++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/libc/startup/gpu/nvptx/CMakeLists.txt b/libc/startup/gpu/nvptx/CMakeLists.txt index f7f58ec702bf2..96ab7540cedb1 100644 --- a/libc/startup/gpu/nvptx/CMakeLists.txt +++ b/libc/startup/gpu/nvptx/CMakeLists.txt @@ -8,6 +8,7 @@ add_startup_object( -nogpulib # Do not include any GPU vendor libraries. -nostdinc -x cuda # Use the CUDA toolchain to emit the `_start` kernel. + -fgpu-rdc # Emit relocatable device code from CUDA. 
--offload-device-only --offload-arch=${LIBC_GPU_TARGET_ARCHITECTURE} NO_GPU_BUNDLE # Compile this file directly without special GPU handling. @@ -15,4 +16,8 @@ add_startup_object( get_fq_target_name(crt1 fq_name) # Ensure that clang uses the correct linker for this object type. -target_link_libraries(${fq_name} PUBLIC "--target=${LIBC_GPU_TARGET_TRIPLE}") +target_link_libraries(${fq_name} + PUBLIC + "-march=${LIBC_GPU_TARGET_ARCHITECTURE}" + "--target=${LIBC_GPU_TARGET_TRIPLE}" +) diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp index 61569423c7b55..cf4077c3d9edd 100644 --- a/libc/startup/gpu/nvptx/start.cpp +++ b/libc/startup/gpu/nvptx/start.cpp @@ -6,10 +6,11 @@ // //===----------------------------------------------------------------------===// -extern "C" __attribute__((device)) int main(int argc, char **argv); +extern "C" __attribute__((device)) int main(int argc, char **argv, char **envp); // TODO: We shouldn't need to use the CUDA language to emit a kernel for NVPTX. extern "C" [[gnu::visibility("protected")]] __attribute__((global)) void -_start(int argc, char **argv, int *ret) { - __atomic_fetch_or(ret, main(argc, argv), __ATOMIC_RELAXED); +_start(int argc, char **argv, char **envp, int *ret, void *in, void *out, + void *buffer) { + __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED); } From 3d7383d7f44a4b9760f6566e62950968e90e0a1c Mon Sep 17 00:00:00 2001 From: Kai Sasaki Date: Thu, 23 Mar 2023 09:59:29 +0900 Subject: [PATCH 057/208] [mlir][affine] Prevent vectorizer test from crash without any map If the vectorizer test pass does not get any affine map, it should output nothing instead of crash. Issue: https://github.com/llvm/llvm-project/issues/61534 Reviewed By: nicolasvasilache, dcaballe Differential Revision: https://reviews.llvm.org/D146601 --- mlir/test/Dialect/Affine/SuperVectorize/compose_maps.mlir | 8 ++++++++ mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp | 3 +++ 2 files changed, 11 insertions(+) diff --git a/mlir/test/Dialect/Affine/SuperVectorize/compose_maps.mlir b/mlir/test/Dialect/Affine/SuperVectorize/compose_maps.mlir index 3b7820cec67a2..b53fc55fdac91 100644 --- a/mlir/test/Dialect/Affine/SuperVectorize/compose_maps.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/compose_maps.mlir @@ -159,3 +159,11 @@ func.func @multi_symbols() { "test_affine_map"() { affine_map = affine_map<(d0, d1)[s0, s1] -> (d0 + 1 + s1, d1 - 1 - s0)> } : () -> () return } + +// ----- + +// CHECK-LABEL: @no_affine_maps +func.func @no_affine_maps() { + // CHECK: return + return +} diff --git a/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp b/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp index 61428bbf7091f..b31dd3f7d866f 100644 --- a/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp +++ b/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp @@ -215,6 +215,9 @@ void VectorizerTestPass::testComposeMaps(llvm::raw_ostream &outs) { .getValue(); maps.push_back(map); } + if (maps.empty()) + // Nothing to compose + return; AffineMap res; for (auto m : maps) { res = res ? 
res.compose(m) : m; From 89a1af749166627704cdf9d676455d32616c2c06 Mon Sep 17 00:00:00 2001 From: Matthias Gehre Date: Thu, 23 Mar 2023 01:08:43 +0000 Subject: [PATCH 058/208] [mlir][tosa] TosaToLinalg: Lower TOSA.Cast via RoundEven according to TOSA spec 0.60.0 TOSA now specifies rounding of ties to even in section 1.8.2., "Main Inference Profile" Reviewed By: eric-k256, rsuderman Differential Revision: https://reviews.llvm.org/D146617 --- mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp | 12 +----------- .../test/Conversion/TosaToLinalg/tosa-to-linalg.mlir | 7 +------ 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp index 271a09539e46e..be24f5ee5feb4 100644 --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp @@ -471,11 +471,6 @@ createLinalgBodyCalculationForElementwiseOp(Operation *op, ValueRange args, } if (arith::FPToSIOp::areCastCompatible(srcTy, dstTy)) { - auto zero = rewriter.create( - loc, rewriter.getF32FloatAttr(0.0f)); - auto half = rewriter.create( - loc, rewriter.getF32FloatAttr(0.5f)); - auto intMin = rewriter.create( loc, rewriter.getF32FloatAttr( APInt::getSignedMinValue(dstTy.getIntOrFloatBitWidth()) @@ -486,12 +481,7 @@ createLinalgBodyCalculationForElementwiseOp(Operation *op, ValueRange args, APInt::getSignedMaxValue(dstTy.getIntOrFloatBitWidth()) .getSExtValue())); - auto added = rewriter.create(loc, args[0], half); - auto subbed = rewriter.create(loc, args[0], half); - auto negative = rewriter.create( - loc, arith::CmpFPredicate::OLT, args[0], zero); - auto rounded = - rewriter.create(loc, negative, subbed, added); + auto rounded = rewriter.create(loc, args[0]); auto clamped = clampFloatHelper(loc, rounded, intMin, intMax, rewriter); diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir index 133999eff1ec3..476131b262fb9 100644 --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir @@ -237,14 +237,9 @@ func.func @test_simple_f32(%arg0: tensor<1xf32>) -> () { %19 = "tosa.sigmoid"(%0) : (tensor<1xf32>) -> tensor<1xf32> // CHECK: linalg.generic - // CHECK: arith.constant 0.000000e+00 - // CHECK: arith.constant 5.000000e-01 // CHECK: arith.constant -2.14748365E+9 // CHECK: arith.constant 2.14748365E+9 - // CHECK: arith.addf - // CHECK: arith.subf - // CHECK: arith.cmpf olt - // CHECK: select + // CHECK: math.roundeven // CHECK: arith.minf // CHECK: arith.maxf // CHECK: arith.fptosi From 25557aa38a0dab76f5b7a4518942f69d879693c0 Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Thu, 23 Mar 2023 11:21:35 +0800 Subject: [PATCH 059/208] Recommit [Modules] Remove unnecessary check when generating name lookup table in ASTWriter Close https://github.com/llvm/llvm-project/issues/61065. We will avoid writing the names from external AST naturally. But currently its check is often false positive since we may have already marked the declarations as external but DeclContext::hasNeedToReconcileExternalVisibleStorage would be false after reconciling. Tested with libcxx's modular build. This patch can improve 8% compilation time in an internal workloads. See the discussion in https://reviews.llvm.org/rG1e0709167f5edd330889f51bb203c458bdb5e359 to see the information for recommitting. 
--- clang/include/clang/Serialization/ASTWriter.h | 1 - clang/lib/Serialization/ASTWriter.cpp | 9 +-- clang/test/Modules/pr61065.cppm | 55 +++++++++++++++++++ 3 files changed, 56 insertions(+), 9 deletions(-) create mode 100644 clang/test/Modules/pr61065.cppm diff --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h index 09ee1744e8945..d31fa38b93825 100644 --- a/clang/include/clang/Serialization/ASTWriter.h +++ b/clang/include/clang/Serialization/ASTWriter.h @@ -514,7 +514,6 @@ class ASTWriter : public ASTDeserializationListener, void WriteTypeAbbrevs(); void WriteType(QualType T); - bool isLookupResultExternal(StoredDeclsList &Result, DeclContext *DC); bool isLookupResultEntirelyExternal(StoredDeclsList &Result, DeclContext *DC); void GenerateNameLookupTable(const DeclContext *DC, diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index e8f390bc5b1dd..94160409c5f53 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -3849,12 +3849,6 @@ class ASTDeclContextNameLookupTrait { } // namespace -bool ASTWriter::isLookupResultExternal(StoredDeclsList &Result, - DeclContext *DC) { - return Result.hasExternalDecls() && - DC->hasNeedToReconcileExternalVisibleStorage(); -} - bool ASTWriter::isLookupResultEntirelyExternal(StoredDeclsList &Result, DeclContext *DC) { for (auto *D : Result.getLookupResult()) @@ -3897,8 +3891,7 @@ ASTWriter::GenerateNameLookupTable(const DeclContext *ConstDC, // don't need to write an entry for the name at all. If we can't // write out a lookup set without performing more deserialization, // just skip this entry. - if (isLookupResultExternal(Result, DC) && - isLookupResultEntirelyExternal(Result, DC)) + if (isLookupResultEntirelyExternal(Result, DC)) continue; // We also skip empty results. 
If any of the results could be external and diff --git a/clang/test/Modules/pr61065.cppm b/clang/test/Modules/pr61065.cppm new file mode 100644 index 0000000000000..44fa3679974ad --- /dev/null +++ b/clang/test/Modules/pr61065.cppm @@ -0,0 +1,55 @@ +// From https://github.com/llvm/llvm-project/issues/61065 +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 %t/a.cppm -emit-module-interface -o %t/a.pcm +// RUN: %clang_cc1 -std=c++20 %t/b.cppm -emit-module-interface -o %t/b.pcm \ +// RUN: -fprebuilt-module-path=%t +// RUN: %clang_cc1 -std=c++20 %t/c.cppm -emit-module-interface -o %t/c.pcm \ +// RUN: -fprebuilt-module-path=%t +// RUN: %clang_cc1 -std=c++20 %t/d.cpp -fsyntax-only -verify -fprebuilt-module-path=%t + +//--- a.cppm +export module a; + +struct base { + base(int) {} +}; + +export struct a : base { + using base::base; +}; + +//--- b.cppm +export module b; + +import a; + +a b() { + return a(1); +} + +//--- c.cppm +export module c; + +import a; +import b; + +struct noncopyable { + noncopyable(noncopyable const &) = delete; + noncopyable() = default; +}; + +export struct c { + noncopyable c0; + a c1 = 43; + c() = default; +}; + +//--- d.cpp +// expected-no-diagnostics +import c; +void d() { + c _; +} From 45a0433b39ffbd7cee9cc8a92f2300324b3548e0 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 22 Mar 2023 21:02:00 -0700 Subject: [PATCH 060/208] [-Wunsafe-buffer-usage] Add [[fallthrough]] after D143206 --- clang/lib/Analysis/UnsafeBufferUsage.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp index 95e4c8388bc44..4a8358af68ec5 100644 --- a/clang/lib/Analysis/UnsafeBufferUsage.cpp +++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp @@ -973,6 +973,7 @@ PointerDereferenceGadget::getFixits(const Strategy &S) const { endOfOperand.value().getLocWithOffset(1), "[0]")}}; } } + [[fallthrough]]; case Strategy::Kind::Iterator: case Strategy::Kind::Array: case Strategy::Kind::Vector: From 1c420cd4e31f68fedca83b4d3a857a5519f4ce03 Mon Sep 17 00:00:00 2001 From: Jun Zhang Date: Thu, 23 Mar 2023 12:48:59 +0800 Subject: [PATCH 061/208] Precommit test for #61120 Signed-off-by: Jun Zhang --- llvm/test/CodeGen/X86/setcc-combine.ll | 560 +++++++++++++++++++++++++ 1 file changed, 560 insertions(+) diff --git a/llvm/test/CodeGen/X86/setcc-combine.ll b/llvm/test/CodeGen/X86/setcc-combine.ll index c2ee78989ba16..f9542a3268ca8 100644 --- a/llvm/test/CodeGen/X86/setcc-combine.ll +++ b/llvm/test/CodeGen/X86/setcc-combine.ll @@ -499,3 +499,563 @@ define double @ogt_no_zero(double %x) { %r = select i1 %cmp, double %x, double %neg ret double %r } + +define i64 @cmp_sgt_not(i64 %a, i64 %b) { +; CHECK-LABEL: cmp_sgt_not: +; CHECK: # %bb.0: +; CHECK-NEXT: notq %rdi +; CHECK-NEXT: notq %rsi +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: cmpq %rsi, %rdi +; CHECK-NEXT: setg %al +; CHECK-NEXT: negq %rax +; CHECK-NEXT: retq + %na = xor i64 %a, -1 + %nb = xor i64 %b, -1 + %c = icmp sgt i64 %na, %nb + %r = sext i1 %c to i64 + ret i64 %r +} + +define i64 @cmp_sgt_not_with_constant(i64 %a) { +; CHECK-LABEL: cmp_sgt_not_with_constant: +; CHECK: # %bb.0: +; CHECK-NEXT: notq %rdi +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: cmpq $43, %rdi +; CHECK-NEXT: setge %al +; CHECK-NEXT: negq %rax +; CHECK-NEXT: retq + %na = xor i64 %a, -1 + %c = icmp sgt i64 %na, 42 + %r = sext i1 %c to i64 + ret i64 %r +} + +define <4 x i32> @cmp_sgt_not_with_vec(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: 
cmp_sgt_not_with_vec: +; CHECK: # %bb.0: +; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 +; CHECK-NEXT: pxor %xmm2, %xmm0 +; CHECK-NEXT: pxor %xmm2, %xmm1 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm0 +; CHECK-NEXT: retq + %na = xor <4 x i32> %a, + %nb = xor <4 x i32> %b, + %c = icmp sgt <4 x i32> %na, %nb + %r = sext <4 x i1> %c to <4 x i32> + ret <4 x i32> %r +} + +define i64 @cmp_ugt_not(i64 %a, i64 %b) { +; CHECK-LABEL: cmp_ugt_not: +; CHECK: # %bb.0: +; CHECK-NEXT: notq %rdi +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: addq %rsi, %rdi +; CHECK-NEXT: sbbq %rax, %rax +; CHECK-NEXT: retq + %na = xor i64 %a, -1 + %nb = xor i64 %b, -1 + %c = icmp ugt i64 %na, %nb + %r = sext i1 %c to i64 + ret i64 %r +} + +define i64 @cmp_ugt_not_with_constant(i64 %a) { +; CHECK-LABEL: cmp_ugt_not_with_constant: +; CHECK: # %bb.0: +; CHECK-NEXT: notq %rdi +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: cmpq $43, %rdi +; CHECK-NEXT: adcq $-1, %rax +; CHECK-NEXT: retq + %na = xor i64 %a, -1 + %c = icmp ugt i64 %na, 42 + %r = sext i1 %c to i64 + ret i64 %r +} + +define <4 x i32> @cmp_ugt_not_with_vec(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: cmp_ugt_not_with_vec: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: cmp_ugt_not_with_vec: +; SSE41: # %bb.0: +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm1 +; SSE41-NEXT: pminud %xmm0, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: retq + %na = xor <4 x i32> %a, + %nb = xor <4 x i32> %b, + %c = icmp ugt <4 x i32> %na, %nb + %r = sext <4 x i1> %c to <4 x i32> + ret <4 x i32> %r +} + +define i64 @cmp_sge_not(i64 %a, i64 %b) { +; CHECK-LABEL: cmp_sge_not: +; CHECK: # %bb.0: +; CHECK-NEXT: notq %rdi +; CHECK-NEXT: notq %rsi +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: cmpq %rsi, %rdi +; CHECK-NEXT: setge %al +; CHECK-NEXT: negq %rax +; CHECK-NEXT: retq + %na = xor i64 %a, -1 + %nb = xor i64 %b, -1 + %c = icmp sge i64 %na, %nb + %r = sext i1 %c to i64 + ret i64 %r +} + +define i64 @cmp_sge_not_with_constant(i64 %a) { +; CHECK-LABEL: cmp_sge_not_with_constant: +; CHECK: # %bb.0: +; CHECK-NEXT: notq %rdi +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: cmpq $42, %rdi +; CHECK-NEXT: setge %al +; CHECK-NEXT: negq %rax +; CHECK-NEXT: retq + %na = xor i64 %a, -1 + %c = icmp sge i64 %na, 42 + %r = sext i1 %c to i64 + ret i64 %r +} + +define <4 x i32> @cmp_sge_not_with_vec(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: cmp_sge_not_with_vec: +; SSE2: # %bb.0: +; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: cmp_sge_not_with_vec: +; SSE41: # %bb.0: +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm2 +; SSE41-NEXT: pmaxud %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT: retq + %na = xor <4 x i32> %a, + %nb = xor <4 x i32> %b, + %c = icmp uge <4 x i32> %na, %nb + %r = sext <4 x i1> %c to <4 x i32> + ret <4 x i32> %r +} + +define i64 @cmp_uge_not(i64 %a, i64 %b) { +; CHECK-LABEL: cmp_uge_not: +; CHECK: # %bb.0: +; CHECK-NEXT: notq %rdi +; CHECK-NEXT: notq %rsi +; CHECK-NEXT: xorl %eax, %eax +; 
CHECK-NEXT: cmpq %rsi, %rdi +; CHECK-NEXT: adcq $-1, %rax +; CHECK-NEXT: retq + %na = xor i64 %a, -1 + %nb = xor i64 %b, -1 + %c = icmp uge i64 %na, %nb + %r = sext i1 %c to i64 + ret i64 %r +} + +define i64 @cmp_uge_not_with_constant(i64 %a) { +; CHECK-LABEL: cmp_uge_not_with_constant: +; CHECK: # %bb.0: +; CHECK-NEXT: notq %rdi +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: cmpq $42, %rdi +; CHECK-NEXT: adcq $-1, %rax +; CHECK-NEXT: retq + %na = xor i64 %a, -1 + %c = icmp uge i64 %na, 42 + %r = sext i1 %c to i64 + ret i64 %r +} + +define <4 x i32> @cmp_uge_not_with_vec(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: cmp_uge_not_with_vec: +; SSE2: # %bb.0: +; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: cmp_uge_not_with_vec: +; SSE41: # %bb.0: +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm2 +; SSE41-NEXT: pmaxud %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT: retq + %na = xor <4 x i32> %a, + %nb = xor <4 x i32> %b, + %c = icmp uge <4 x i32> %na, %nb + %r = sext <4 x i1> %c to <4 x i32> + ret <4 x i32> %r +} + +define i64 @cmp_sle_not(i64 %a, i64 %b) { +; CHECK-LABEL: cmp_sle_not: +; CHECK: # %bb.0: +; CHECK-NEXT: notq %rdi +; CHECK-NEXT: notq %rsi +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: cmpq %rsi, %rdi +; CHECK-NEXT: setle %al +; CHECK-NEXT: negq %rax +; CHECK-NEXT: retq + %na = xor i64 %a, -1 + %nb = xor i64 %b, -1 + %c = icmp sle i64 %na, %nb + %r = sext i1 %c to i64 + ret i64 %r +} + +define i64 @cmp_sle_not_with_constant(i64 %a) { +; CHECK-LABEL: cmp_sle_not_with_constant: +; CHECK: # %bb.0: +; CHECK-NEXT: notq %rdi +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: cmpq $43, %rdi +; CHECK-NEXT: setl %al +; CHECK-NEXT: negq %rax +; CHECK-NEXT: retq + %na = xor i64 %a, -1 + %c = icmp sle i64 %na, 42 + %r = sext i1 %c to i64 + ret i64 %r +} + +define <4 x i32> @cmp_sle_not_with_vec(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: cmp_sle_not_with_vec: +; CHECK: # %bb.0: +; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 +; CHECK-NEXT: pxor %xmm2, %xmm0 +; CHECK-NEXT: pxor %xmm2, %xmm1 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm0 +; CHECK-NEXT: pxor %xmm2, %xmm0 +; CHECK-NEXT: retq + %na = xor <4 x i32> %a, + %nb = xor <4 x i32> %b, + %c = icmp sle <4 x i32> %na, %nb + %r = sext <4 x i1> %c to <4 x i32> + ret <4 x i32> %r +} + +define i64 @cmp_slt_not(i64 %a, i64 %b) { +; CHECK-LABEL: cmp_slt_not: +; CHECK: # %bb.0: +; CHECK-NEXT: notq %rdi +; CHECK-NEXT: notq %rsi +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: cmpq %rsi, %rdi +; CHECK-NEXT: setl %al +; CHECK-NEXT: negq %rax +; CHECK-NEXT: retq + %na = xor i64 %a, -1 + %nb = xor i64 %b, -1 + %c = icmp slt i64 %na, %nb + %r = sext i1 %c to i64 + ret i64 %r +} + +define i64 @cmp_slt_not_with_constant(i64 %a) { +; CHECK-LABEL: cmp_slt_not_with_constant: +; CHECK: # %bb.0: +; CHECK-NEXT: notq %rdi +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: cmpq $42, %rdi +; CHECK-NEXT: setl %al +; CHECK-NEXT: negq %rax +; CHECK-NEXT: retq + %na = xor i64 %a, -1 + %c = icmp slt i64 %na, 42 + %r = sext i1 %c to i64 + ret i64 %r +} + +define <4 x i32> @cmp_slt_not_with_vec(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: cmp_slt_not_with_vec: +; CHECK: # %bb.0: +; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 +; CHECK-NEXT: pxor %xmm2, %xmm0 +; CHECK-NEXT: pxor %xmm1, 
%xmm2 +; CHECK-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-NEXT: movdqa %xmm2, %xmm0 +; CHECK-NEXT: retq + %na = xor <4 x i32> %a, + %nb = xor <4 x i32> %b, + %c = icmp slt <4 x i32> %na, %nb + %r = sext <4 x i1> %c to <4 x i32> + ret <4 x i32> %r +} + + +define i64 @cmp_ult_not(i64 %a, i64 %b) { +; CHECK-LABEL: cmp_ult_not: +; CHECK: # %bb.0: +; CHECK-NEXT: notq %rsi +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: addq %rdi, %rsi +; CHECK-NEXT: sbbq %rax, %rax +; CHECK-NEXT: retq + %na = xor i64 %a, -1 + %nb = xor i64 %b, -1 + %c = icmp ult i64 %na, %nb + %r = sext i1 %c to i64 + ret i64 %r +} + +define i64 @cmp_ult_not_with_constant(i64 %a) { +; CHECK-LABEL: cmp_ult_not_with_constant: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: addq $42, %rdi +; CHECK-NEXT: sbbq %rax, %rax +; CHECK-NEXT: retq + %na = xor i64 %a, -1 + %c = icmp ult i64 %na, 42 + %r = sext i1 %c to i64 + ret i64 %r +} + +define <4 x i32> @cmp_ult_not_with_vec(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: cmp_ult_not_with_vec: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: cmp_ult_not_with_vec: +; SSE41: # %bb.0: +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm1 +; SSE41-NEXT: pmaxud %xmm0, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: retq + %na = xor <4 x i32> %a, + %nb = xor <4 x i32> %b, + %c = icmp ult <4 x i32> %na, %nb + %r = sext <4 x i1> %c to <4 x i32> + ret <4 x i32> %r +} + +define i64 @cmp_ule_not(i64 %a, i64 %b) { +; CHECK-LABEL: cmp_ule_not: +; CHECK: # %bb.0: +; CHECK-NEXT: notq %rdi +; CHECK-NEXT: notq %rsi +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: cmpq %rdi, %rsi +; CHECK-NEXT: adcq $-1, %rax +; CHECK-NEXT: retq + %na = xor i64 %a, -1 + %nb = xor i64 %b, -1 + %c = icmp ule i64 %na, %nb + %r = sext i1 %c to i64 + ret i64 %r +} + +define i64 @cmp_ule_not_with_constant(i64 %a) { +; CHECK-LABEL: cmp_ule_not_with_constant: +; CHECK: # %bb.0: +; CHECK-NEXT: notq %rdi +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: cmpq $43, %rdi +; CHECK-NEXT: sbbq %rax, %rax +; CHECK-NEXT: retq + %na = xor i64 %a, -1 + %c = icmp ule i64 %na, 42 + %r = sext i1 %c to i64 + ret i64 %r +} + +define <4 x i32> @cmp_ule_not_with_vec(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: cmp_ule_not_with_vec: +; SSE2: # %bb.0: +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] +; SSE2-NEXT: pxor %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: cmp_ule_not_with_vec: +; SSE41: # %bb.0: +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm2 +; SSE41-NEXT: pminud %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT: retq + %na = xor <4 x i32> %a, + %nb = xor <4 x i32> %b, + %c = icmp ule <4 x i32> %na, %nb + %r = sext <4 x i1> %c to <4 x i32> + ret <4 x i32> %r +} + +define i64 @cmp_eq_not(i64 %a, i64 %b) { +; CHECK-LABEL: cmp_eq_not: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: cmpq %rsi, %rdi +; CHECK-NEXT: sete %al +; CHECK-NEXT: negq %rax +; CHECK-NEXT: retq + %na = xor i64 %a, -1 + %nb = xor i64 %b, -1 + %c = icmp eq i64 %na, %nb + %r = sext i1 %c to i64 + ret i64 %r +} + 
+define i64 @cmp_eq_not_with_constant(i64 %a) { +; CHECK-LABEL: cmp_eq_not_with_constant: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: cmpq $-43, %rdi +; CHECK-NEXT: sete %al +; CHECK-NEXT: negq %rax +; CHECK-NEXT: retq + %na = xor i64 %a, -1 + %c = icmp eq i64 %na, 42 + %r = sext i1 %c to i64 + ret i64 %r +} + +define <4 x i32> @cmp_eq_not_with_vec(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: cmp_eq_not_with_vec: +; CHECK: # %bb.0: +; CHECK-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-NEXT: retq + %na = xor <4 x i32> %a, + %nb = xor <4 x i32> %b, + %c = icmp eq <4 x i32> %na, %nb + %r = sext <4 x i1> %c to <4 x i32> + ret <4 x i32> %r +} +define i64 @cmp_ne_not(i64 %a, i64 %b) { +; CHECK-LABEL: cmp_ne_not: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: cmpq %rsi, %rdi +; CHECK-NEXT: setne %al +; CHECK-NEXT: negq %rax +; CHECK-NEXT: retq + %na = xor i64 %a, -1 + %nb = xor i64 %b, -1 + %c = icmp ne i64 %na, %nb + %r = sext i1 %c to i64 + ret i64 %r +} + +define i64 @cmp_ne_not_with_constant(i64 %a) { +; CHECK-LABEL: cmp_ne_not_with_constant: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: cmpq $-43, %rdi +; CHECK-NEXT: setne %al +; CHECK-NEXT: negq %rax +; CHECK-NEXT: retq + %na = xor i64 %a, -1 + %c = icmp ne i64 %na, 42 + %r = sext i1 %c to i64 + ret i64 %r +} + +define <4 x i32> @cmp_ne_not_with_vec(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: cmp_ne_not_with_vec: +; CHECK: # %bb.0: +; CHECK-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-NEXT: pcmpeqd %xmm1, %xmm1 +; CHECK-NEXT: pxor %xmm1, %xmm0 +; CHECK-NEXT: retq + %na = xor <4 x i32> %a, + %nb = xor <4 x i32> %b, + %c = icmp ne <4 x i32> %na, %nb + %r = sext <4 x i1> %c to <4 x i32> + ret <4 x i32> %r +} + +define i64 @cmp_uge_not_commute(i64 %b, i64 %a) { +; CHECK-LABEL: cmp_uge_not_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: notq %rsi +; CHECK-NEXT: notq %rdi +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: cmpq %rdi, %rsi +; CHECK-NEXT: adcq $-1, %rax +; CHECK-NEXT: retq + %na = xor i64 %a, -1 + %nb = xor i64 %b, -1 + %c = icmp uge i64 %na, %nb + %r = sext i1 %c to i64 + ret i64 %r +} + +define i64 @cmp_ult_not_with_constant_commute(i64 %a) { +; CHECK-LABEL: cmp_ult_not_with_constant_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: cmpq $43, %rdi +; CHECK-NEXT: adcq $-1, %rax +; CHECK-NEXT: retq + %na = xor i64 %a, -1 + %c = icmp ult i64 42, %a + %r = sext i1 %c to i64 + ret i64 %r +} + +define <2 x i64> @cmp_uge_not_with_vec2xi64(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: cmp_uge_not_with_vec2xi64: +; CHECK: # %bb.0: +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [9223372034707292159,9223372034707292159] +; CHECK-NEXT: pxor %xmm2, %xmm0 +; CHECK-NEXT: pxor %xmm2, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm2 +; CHECK-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm0, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm3, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-NEXT: por %xmm0, %xmm1 +; CHECK-NEXT: pcmpeqd %xmm0, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm0 +; CHECK-NEXT: retq + %na = xor <2 x i64> %a, + %nb = xor <2 x i64> %b, + %c = icmp uge <2 x i64> %na, %nb + %r = sext <2 x i1> %c to <2 x i64> + ret <2 x i64> %r +} From b3e12beb44dc36e9ed0f5e9cb3fb1eef0823894e Mon Sep 17 00:00:00 2001 From: Jun Zhang Date: Thu, 23 Mar 2023 12:49:05 +0800 Subject: [PATCH 062/208] [TLI] Fold ~X >/< ~Y --> Y >/< X Fixes: https://github.com/llvm/llvm-project/issues/61120 Signed-off-by: 
Jun Zhang Differential Revision: https://reviews.llvm.org/D146512 --- .../CodeGen/SelectionDAG/TargetLowering.cpp | 17 ++ llvm/test/CodeGen/X86/cmov.ll | 4 +- llvm/test/CodeGen/X86/setcc-combine.ll | 178 +++++++----------- 3 files changed, 86 insertions(+), 113 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index c82f9ce64ea5a..9ef3c15cfe374 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -4974,6 +4974,23 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, return DAG.getSetCC(dl, VT, N0, N1, NewCond); } + // ~X > ~Y --> Y > X + // ~X < ~Y --> Y < X + // ~X < C --> X > ~C + // ~X > C --> X < ~C + if ((isSignedIntSetCC(Cond) || isUnsignedIntSetCC(Cond)) && + N0.getValueType().isInteger()) { + if (isBitwiseNot(N0)) { + if (isBitwiseNot(N1)) + return DAG.getSetCC(dl, VT, N1.getOperand(0), N0.getOperand(0), Cond); + + if (DAG.isConstantIntBuildVectorOrConstantInt(N1)) { + SDValue Not = DAG.getNOT(dl, N1, OpVT); + return DAG.getSetCC(dl, VT, Not, N0.getOperand(0), Cond); + } + } + } + if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && N0.getValueType().isInteger()) { if (N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::SUB || diff --git a/llvm/test/CodeGen/X86/cmov.ll b/llvm/test/CodeGen/X86/cmov.ll index 94df5fa6d96fc..dbe85eced6a59 100644 --- a/llvm/test/CodeGen/X86/cmov.ll +++ b/llvm/test/CodeGen/X86/cmov.ll @@ -213,10 +213,10 @@ define i64 @test8(i64 %0, i64 %1, i64 %2) { define i32 @smin(i32 %x) { ; CHECK-LABEL: smin: ; CHECK: # %bb.0: -; CHECK-NEXT: notl %edi ; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: notl %edi ; CHECK-NEXT: movl $-1, %eax -; CHECK-NEXT: cmovsl %edi, %eax +; CHECK-NEXT: cmovnsl %edi, %eax ; CHECK-NEXT: retq %not_x = xor i32 %x, -1 %1 = icmp slt i32 %not_x, -1 diff --git a/llvm/test/CodeGen/X86/setcc-combine.ll b/llvm/test/CodeGen/X86/setcc-combine.ll index f9542a3268ca8..780a769bc9e2b 100644 --- a/llvm/test/CodeGen/X86/setcc-combine.ll +++ b/llvm/test/CodeGen/X86/setcc-combine.ll @@ -503,10 +503,8 @@ define double @ogt_no_zero(double %x) { define i64 @cmp_sgt_not(i64 %a, i64 %b) { ; CHECK-LABEL: cmp_sgt_not: ; CHECK: # %bb.0: -; CHECK-NEXT: notq %rdi -; CHECK-NEXT: notq %rsi ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: cmpq %rsi, %rdi +; CHECK-NEXT: cmpq %rdi, %rsi ; CHECK-NEXT: setg %al ; CHECK-NEXT: negq %rax ; CHECK-NEXT: retq @@ -520,10 +518,9 @@ define i64 @cmp_sgt_not(i64 %a, i64 %b) { define i64 @cmp_sgt_not_with_constant(i64 %a) { ; CHECK-LABEL: cmp_sgt_not_with_constant: ; CHECK: # %bb.0: -; CHECK-NEXT: notq %rdi ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: cmpq $43, %rdi -; CHECK-NEXT: setge %al +; CHECK-NEXT: cmpq $-43, %rdi +; CHECK-NEXT: setl %al ; CHECK-NEXT: negq %rax ; CHECK-NEXT: retq %na = xor i64 %a, -1 @@ -535,10 +532,8 @@ define i64 @cmp_sgt_not_with_constant(i64 %a) { define <4 x i32> @cmp_sgt_not_with_vec(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: cmp_sgt_not_with_vec: ; CHECK: # %bb.0: -; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 -; CHECK-NEXT: pxor %xmm2, %xmm0 -; CHECK-NEXT: pxor %xmm2, %xmm1 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm0 +; CHECK-NEXT: pcmpgtd %xmm0, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %na = xor <4 x i32> %a, %nb = xor <4 x i32> %b, @@ -565,10 +560,9 @@ define i64 @cmp_ugt_not(i64 %a, i64 %b) { define i64 @cmp_ugt_not_with_constant(i64 %a) { ; CHECK-LABEL: cmp_ugt_not_with_constant: ; CHECK: # %bb.0: -; CHECK-NEXT: notq %rdi ; CHECK-NEXT: xorl %eax, 
%eax -; CHECK-NEXT: cmpq $43, %rdi -; CHECK-NEXT: adcq $-1, %rax +; CHECK-NEXT: cmpq $-43, %rdi +; CHECK-NEXT: sbbq %rax, %rax ; CHECK-NEXT: retq %na = xor i64 %a, -1 %c = icmp ugt i64 %na, 42 @@ -579,20 +573,19 @@ define i64 @cmp_ugt_not_with_constant(i64 %a) { define <4 x i32> @cmp_ugt_not_with_vec(<4 x i32> %a, <4 x i32> %b) { ; SSE2-LABEL: cmp_ugt_not_with_vec: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] -; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: cmp_ugt_not_with_vec: ; SSE41: # %bb.0: -; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm1 -; SSE41-NEXT: pminud %xmm0, %xmm1 +; SSE41-NEXT: pminud %xmm1, %xmm0 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: pxor %xmm1, %xmm0 ; SSE41-NEXT: retq %na = xor <4 x i32> %a, %nb = xor <4 x i32> %b, @@ -604,10 +597,8 @@ define <4 x i32> @cmp_ugt_not_with_vec(<4 x i32> %a, <4 x i32> %b) { define i64 @cmp_sge_not(i64 %a, i64 %b) { ; CHECK-LABEL: cmp_sge_not: ; CHECK: # %bb.0: -; CHECK-NEXT: notq %rdi -; CHECK-NEXT: notq %rsi ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: cmpq %rsi, %rdi +; CHECK-NEXT: cmpq %rdi, %rsi ; CHECK-NEXT: setge %al ; CHECK-NEXT: negq %rax ; CHECK-NEXT: retq @@ -621,10 +612,9 @@ define i64 @cmp_sge_not(i64 %a, i64 %b) { define i64 @cmp_sge_not_with_constant(i64 %a) { ; CHECK-LABEL: cmp_sge_not_with_constant: ; CHECK: # %bb.0: -; CHECK-NEXT: notq %rdi ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: cmpq $42, %rdi -; CHECK-NEXT: setge %al +; CHECK-NEXT: cmpq $-42, %rdi +; CHECK-NEXT: setl %al ; CHECK-NEXT: negq %rax ; CHECK-NEXT: retq %na = xor i64 %a, -1 @@ -636,22 +626,18 @@ define i64 @cmp_sge_not_with_constant(i64 %a) { define <4 x i32> @cmp_sge_not_with_vec(<4 x i32> %a, <4 x i32> %b) { ; SSE2-LABEL: cmp_sge_not_with_vec: ; SSE2: # %bb.0: -; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: cmp_sge_not_with_vec: ; SSE41: # %bb.0: -; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm2 -; SSE41-NEXT: pmaxud %xmm0, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT: pmaxud %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE41-NEXT: retq %na = xor <4 x i32> %a, %nb = xor <4 x i32> %b, @@ -663,10 +649,8 @@ define <4 x i32> @cmp_sge_not_with_vec(<4 x i32> %a, <4 x i32> %b) { define i64 @cmp_uge_not(i64 %a, i64 %b) { ; CHECK-LABEL: cmp_uge_not: ; CHECK: # %bb.0: -; CHECK-NEXT: notq %rdi -; CHECK-NEXT: notq %rsi ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: cmpq %rsi, %rdi +; CHECK-NEXT: cmpq %rdi, %rsi ; CHECK-NEXT: adcq $-1, %rax ; CHECK-NEXT: retq %na = xor i64 %a, -1 @@ -679,10 +663,9 @@ define i64 @cmp_uge_not(i64 %a, i64 %b) { define i64 
@cmp_uge_not_with_constant(i64 %a) { ; CHECK-LABEL: cmp_uge_not_with_constant: ; CHECK: # %bb.0: -; CHECK-NEXT: notq %rdi ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: cmpq $42, %rdi -; CHECK-NEXT: adcq $-1, %rax +; CHECK-NEXT: cmpq $-42, %rdi +; CHECK-NEXT: sbbq %rax, %rax ; CHECK-NEXT: retq %na = xor i64 %a, -1 %c = icmp uge i64 %na, 42 @@ -693,22 +676,18 @@ define i64 @cmp_uge_not_with_constant(i64 %a) { define <4 x i32> @cmp_uge_not_with_vec(<4 x i32> %a, <4 x i32> %b) { ; SSE2-LABEL: cmp_uge_not_with_vec: ; SSE2: # %bb.0: -; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: cmp_uge_not_with_vec: ; SSE41: # %bb.0: -; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm2 -; SSE41-NEXT: pmaxud %xmm0, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT: pmaxud %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE41-NEXT: retq %na = xor <4 x i32> %a, %nb = xor <4 x i32> %b, @@ -720,10 +699,8 @@ define <4 x i32> @cmp_uge_not_with_vec(<4 x i32> %a, <4 x i32> %b) { define i64 @cmp_sle_not(i64 %a, i64 %b) { ; CHECK-LABEL: cmp_sle_not: ; CHECK: # %bb.0: -; CHECK-NEXT: notq %rdi -; CHECK-NEXT: notq %rsi ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: cmpq %rsi, %rdi +; CHECK-NEXT: cmpq %rdi, %rsi ; CHECK-NEXT: setle %al ; CHECK-NEXT: negq %rax ; CHECK-NEXT: retq @@ -737,10 +714,9 @@ define i64 @cmp_sle_not(i64 %a, i64 %b) { define i64 @cmp_sle_not_with_constant(i64 %a) { ; CHECK-LABEL: cmp_sle_not_with_constant: ; CHECK: # %bb.0: -; CHECK-NEXT: notq %rdi ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: cmpq $43, %rdi -; CHECK-NEXT: setl %al +; CHECK-NEXT: cmpq $-43, %rdi +; CHECK-NEXT: setge %al ; CHECK-NEXT: negq %rax ; CHECK-NEXT: retq %na = xor i64 %a, -1 @@ -752,11 +728,9 @@ define i64 @cmp_sle_not_with_constant(i64 %a) { define <4 x i32> @cmp_sle_not_with_vec(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: cmp_sle_not_with_vec: ; CHECK: # %bb.0: -; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 -; CHECK-NEXT: pxor %xmm2, %xmm0 -; CHECK-NEXT: pxor %xmm2, %xmm1 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm0 -; CHECK-NEXT: pxor %xmm2, %xmm0 +; CHECK-NEXT: pcmpgtd %xmm0, %xmm1 +; CHECK-NEXT: pcmpeqd %xmm0, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm0 ; CHECK-NEXT: retq %na = xor <4 x i32> %a, %nb = xor <4 x i32> %b, @@ -768,10 +742,8 @@ define <4 x i32> @cmp_sle_not_with_vec(<4 x i32> %a, <4 x i32> %b) { define i64 @cmp_slt_not(i64 %a, i64 %b) { ; CHECK-LABEL: cmp_slt_not: ; CHECK: # %bb.0: -; CHECK-NEXT: notq %rdi -; CHECK-NEXT: notq %rsi ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: cmpq %rsi, %rdi +; CHECK-NEXT: cmpq %rdi, %rsi ; CHECK-NEXT: setl %al ; CHECK-NEXT: negq %rax ; CHECK-NEXT: retq @@ -785,10 +757,9 @@ define i64 @cmp_slt_not(i64 %a, i64 %b) { define i64 @cmp_slt_not_with_constant(i64 %a) { ; CHECK-LABEL: cmp_slt_not_with_constant: ; CHECK: # %bb.0: -; CHECK-NEXT: notq %rdi ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: cmpq $42, %rdi -; CHECK-NEXT: setl %al +; CHECK-NEXT: cmpq $-42, %rdi +; CHECK-NEXT: setge %al ; CHECK-NEXT: negq %rax ; CHECK-NEXT: retq %na = xor i64 %a, -1 @@ -800,11 +771,7 @@ define 
i64 @cmp_slt_not_with_constant(i64 %a) { define <4 x i32> @cmp_slt_not_with_vec(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: cmp_slt_not_with_vec: ; CHECK: # %bb.0: -; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 -; CHECK-NEXT: pxor %xmm2, %xmm0 -; CHECK-NEXT: pxor %xmm1, %xmm2 -; CHECK-NEXT: pcmpgtd %xmm0, %xmm2 -; CHECK-NEXT: movdqa %xmm2, %xmm0 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm0 ; CHECK-NEXT: retq %na = xor <4 x i32> %a, %nb = xor <4 x i32> %b, @@ -845,21 +812,18 @@ define i64 @cmp_ult_not_with_constant(i64 %a) { define <4 x i32> @cmp_ult_not_with_vec(<4 x i32> %a, <4 x i32> %b) { ; SSE2-LABEL: cmp_ult_not_with_vec: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: cmp_ult_not_with_vec: ; SSE41: # %bb.0: -; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm1 -; SSE41-NEXT: pmaxud %xmm0, %xmm1 +; SSE41-NEXT: pmaxud %xmm1, %xmm0 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: pxor %xmm1, %xmm0 ; SSE41-NEXT: retq %na = xor <4 x i32> %a, %nb = xor <4 x i32> %b, @@ -871,10 +835,8 @@ define <4 x i32> @cmp_ult_not_with_vec(<4 x i32> %a, <4 x i32> %b) { define i64 @cmp_ule_not(i64 %a, i64 %b) { ; CHECK-LABEL: cmp_ule_not: ; CHECK: # %bb.0: -; CHECK-NEXT: notq %rdi -; CHECK-NEXT: notq %rsi ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: cmpq %rdi, %rsi +; CHECK-NEXT: cmpq %rsi, %rdi ; CHECK-NEXT: adcq $-1, %rax ; CHECK-NEXT: retq %na = xor i64 %a, -1 @@ -887,10 +849,9 @@ define i64 @cmp_ule_not(i64 %a, i64 %b) { define i64 @cmp_ule_not_with_constant(i64 %a) { ; CHECK-LABEL: cmp_ule_not_with_constant: ; CHECK: # %bb.0: -; CHECK-NEXT: notq %rdi ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: cmpq $43, %rdi -; CHECK-NEXT: sbbq %rax, %rax +; CHECK-NEXT: cmpq $-43, %rdi +; CHECK-NEXT: adcq $-1, %rax ; CHECK-NEXT: retq %na = xor i64 %a, -1 %c = icmp ule i64 %na, 42 @@ -901,21 +862,18 @@ define i64 @cmp_ule_not_with_constant(i64 %a) { define <4 x i32> @cmp_ule_not_with_vec(<4 x i32> %a, <4 x i32> %b) { ; SSE2-LABEL: cmp_ule_not_with_vec: ; SSE2: # %bb.0: -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] -; SSE2-NEXT: pxor %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: cmp_ule_not_with_vec: ; SSE41: # %bb.0: -; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm2 -; SSE41-NEXT: pminud %xmm0, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT: pminud %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE41-NEXT: retq %na = xor <4 x i32> %a, %nb = xor <4 x i32> %b, @@ -1010,10 +968,8 @@ define <4 x i32> @cmp_ne_not_with_vec(<4 x i32> %a, <4 x i32> %b) { define i64 @cmp_uge_not_commute(i64 %b, i64 %a) { ; CHECK-LABEL: cmp_uge_not_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: notq %rsi -; CHECK-NEXT: notq %rdi ; CHECK-NEXT: 
xorl %eax, %eax -; CHECK-NEXT: cmpq %rdi, %rsi +; CHECK-NEXT: cmpq %rsi, %rdi ; CHECK-NEXT: adcq $-1, %rax ; CHECK-NEXT: retq %na = xor i64 %a, -1 @@ -1039,14 +995,14 @@ define i64 @cmp_ult_not_with_constant_commute(i64 %a) { define <2 x i64> @cmp_uge_not_with_vec2xi64(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: cmp_uge_not_with_vec2xi64: ; CHECK: # %bb.0: -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [9223372034707292159,9223372034707292159] -; CHECK-NEXT: pxor %xmm2, %xmm0 +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] ; CHECK-NEXT: pxor %xmm2, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm2 -; CHECK-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-NEXT: pxor %xmm2, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; CHECK-NEXT: pcmpeqd %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; CHECK-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; CHECK-NEXT: pand %xmm3, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; CHECK-NEXT: por %xmm0, %xmm1 From 8d93cbed6e383d5a8b5985f9087cb31ffd5ac7f5 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 22 Mar 2023 22:10:20 -0700 Subject: [PATCH 063/208] [InstCombine] Precommit tests This patch precommits tests for: https://github.com/llvm/llvm-project/issues/60802 --- llvm/test/Transforms/InstCombine/bit_ceil.ll | 146 +++++++++++++++++++ 1 file changed, 146 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/bit_ceil.ll b/llvm/test/Transforms/InstCombine/bit_ceil.ll index aa98896aac549..98f4cdb6fb834 100644 --- a/llvm/test/Transforms/InstCombine/bit_ceil.ll +++ b/llvm/test/Transforms/InstCombine/bit_ceil.ll @@ -85,6 +85,7 @@ define i32 @bit_ceil_32_plus_1(i32 %x) { ret i32 %sel } +; std::bit_ceil(x + 2) define i32 @bit_ceil_plus_2(i32 %x) { ; CHECK-LABEL: @bit_ceil_plus_2( ; CHECK-NEXT: entry: @@ -152,5 +153,150 @@ entry: ret i32 %sel } +; Commuted select operands should still be recognized. 
+define i32 @bit_ceil_commuted_operands(i32 %x) { +; CHECK-LABEL: @bit_ceil_commuted_operands( +; CHECK-NEXT: [[DEC:%.*]] = add i32 [[X:%.*]], -1 +; CHECK-NEXT: [[CTLZ:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[DEC]], i1 false), !range [[RNG0]] +; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i32 32, [[CTLZ]] +; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 1, [[SUB]] +; CHECK-NEXT: [[UGT_INV:%.*]] = icmp ugt i32 [[X]], 1 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[UGT_INV]], i32 [[SHL]], i32 1 +; CHECK-NEXT: ret i32 [[SEL]] +; + %dec = add i32 %x, -1 + %ctlz = tail call i32 @llvm.ctlz.i32(i32 %dec, i1 false) + %sub = sub i32 32, %ctlz + %shl = shl i32 1, %sub + %ugt = icmp ule i32 %x, 1 + %sel = select i1 %ugt, i32 1, i32 %shl + ret i32 %sel +} + +; Negative test: wrong select constant +define i32 @bit_ceil_wrong_select_constant(i32 %x) { +; CHECK-LABEL: @bit_ceil_wrong_select_constant( +; CHECK-NEXT: [[DEC:%.*]] = add i32 [[X:%.*]], -1 +; CHECK-NEXT: [[CTLZ:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[DEC]], i1 false), !range [[RNG0]] +; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i32 32, [[CTLZ]] +; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 1, [[SUB]] +; CHECK-NEXT: [[UGT_INV:%.*]] = icmp ult i32 [[X]], 2 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[UGT_INV]], i32 2, i32 [[SHL]] +; CHECK-NEXT: ret i32 [[SEL]] +; + %dec = add i32 %x, -1 + %ctlz = tail call i32 @llvm.ctlz.i32(i32 %dec, i1 false) + %sub = sub i32 32, %ctlz + %shl = shl i32 1, %sub + %ugt = icmp ugt i32 %x, 1 + %sel = select i1 %ugt, i32 %shl, i32 2 + ret i32 %sel +} + +; Negative test: select condition != false does not guarantee ctlz being either 0 or 32 +define i32 @bit_ceil_32_wrong_cond(i32 %x) { +; CHECK-LABEL: @bit_ceil_32_wrong_cond( +; CHECK-NEXT: [[DEC:%.*]] = add i32 [[X:%.*]], -1 +; CHECK-NEXT: [[CTLZ:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[DEC]], i1 false), !range [[RNG0]] +; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i32 32, [[CTLZ]] +; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 1, [[SUB]] +; CHECK-NEXT: [[UGT:%.*]] = icmp ugt i32 [[X]], 2 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[UGT]], i32 [[SHL]], i32 1 +; CHECK-NEXT: ret i32 [[SEL]] +; + %dec = add i32 %x, -1 + %ctlz = tail call i32 @llvm.ctlz.i32(i32 %dec, i1 false) + %sub = sub i32 32, %ctlz + %shl = shl i32 1, %sub + %ugt = icmp ugt i32 %x, 2 + %sel = select i1 %ugt, i32 %shl, i32 1 + ret i32 %sel +} + +; Negative test: wrong sub constant +define i32 @bit_ceil_wrong_sub_constant(i32 %x) { +; CHECK-LABEL: @bit_ceil_wrong_sub_constant( +; CHECK-NEXT: [[DEC:%.*]] = add i32 [[X:%.*]], -1 +; CHECK-NEXT: [[CTLZ:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[DEC]], i1 false), !range [[RNG0]] +; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i32 33, [[CTLZ]] +; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 1, [[SUB]] +; CHECK-NEXT: [[UGT:%.*]] = icmp ugt i32 [[X]], 1 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[UGT]], i32 [[SHL]], i32 1 +; CHECK-NEXT: ret i32 [[SEL]] +; + %dec = add i32 %x, -1 + %ctlz = tail call i32 @llvm.ctlz.i32(i32 %dec, i1 false) + %sub = sub i32 33, %ctlz + %shl = shl i32 1, %sub + %ugt = icmp ugt i32 %x, 1 + %sel = select i1 %ugt, i32 %shl, i32 1 + ret i32 %sel +} + +; Negative test: the shl result used twice +define i32 @bit_ceil_32_shl_used_twice(i32 %x, ptr %p) { +; CHECK-LABEL: @bit_ceil_32_shl_used_twice( +; CHECK-NEXT: [[DEC:%.*]] = add i32 [[X:%.*]], -1 +; CHECK-NEXT: [[CTLZ:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[DEC]], i1 false), !range [[RNG0]] +; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i32 32, [[CTLZ]] +; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 1, [[SUB]] +; CHECK-NEXT: [[UGT:%.*]] = icmp ugt 
i32 [[X]], 1 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[UGT]], i32 [[SHL]], i32 1 +; CHECK-NEXT: store i32 [[SHL]], ptr [[P:%.*]], align 4 +; CHECK-NEXT: ret i32 [[SEL]] +; + %dec = add i32 %x, -1 + %ctlz = tail call i32 @llvm.ctlz.i32(i32 %dec, i1 false) + %sub = sub i32 32, %ctlz + %shl = shl i32 1, %sub + %ugt = icmp ugt i32 %x, 1 + %sel = select i1 %ugt, i32 %shl, i32 1 + store i32 %shl, ptr %p, align 4 + ret i32 %sel +} + +; Negative test: the sub result used twice +define i32 @bit_ceil_32_sub_used_twice(i32 %x, ptr %p) { +; CHECK-LABEL: @bit_ceil_32_sub_used_twice( +; CHECK-NEXT: [[DEC:%.*]] = add i32 [[X:%.*]], -1 +; CHECK-NEXT: [[CTLZ:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[DEC]], i1 false), !range [[RNG0]] +; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i32 32, [[CTLZ]] +; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 1, [[SUB]] +; CHECK-NEXT: [[UGT:%.*]] = icmp ugt i32 [[X]], 1 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[UGT]], i32 [[SHL]], i32 1 +; CHECK-NEXT: store i32 [[SUB]], ptr [[P:%.*]], align 4 +; CHECK-NEXT: ret i32 [[SEL]] +; + %dec = add i32 %x, -1 + %ctlz = tail call i32 @llvm.ctlz.i32(i32 %dec, i1 false) + %sub = sub i32 32, %ctlz + %shl = shl i32 1, %sub + %ugt = icmp ugt i32 %x, 1 + %sel = select i1 %ugt, i32 %shl, i32 1 + store i32 %sub, ptr %p, align 4 + ret i32 %sel +} + +; a vector version of @bit_ceil_32 above +define <4 x i32> @bit_ceil_v4i32(<4 x i32> %x) { +; CHECK-LABEL: @bit_ceil_v4i32( +; CHECK-NEXT: [[DEC:%.*]] = add <4 x i32> [[X:%.*]], +; CHECK-NEXT: [[CTLZ:%.*]] = tail call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[DEC]], i1 false), !range [[RNG0]] +; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw <4 x i32> , [[CTLZ]] +; CHECK-NEXT: [[SHL:%.*]] = shl nuw <4 x i32> , [[SUB]] +; CHECK-NEXT: [[UGT:%.*]] = icmp ugt <4 x i32> [[X]], +; CHECK-NEXT: [[SEL:%.*]] = select <4 x i1> [[UGT]], <4 x i32> [[SHL]], <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[SEL]] +; + %dec = add <4 x i32> %x, + %ctlz = tail call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %dec, i1 false) + %sub = sub <4 x i32> , %ctlz + %shl = shl <4 x i32> , %sub + %ugt = icmp ugt <4 x i32> %x, + %sel = select <4 x i1> %ugt, <4 x i32> %shl, <4 x i32> + ret <4 x i32> %sel +} + declare i32 @llvm.ctlz.i32(i32, i1 immarg) declare i64 @llvm.ctlz.i64(i64, i1 immarg) +declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) From 0d19e583f1066935de5d0e9c55ee4a4f78649e23 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 22 Mar 2023 22:10:22 -0700 Subject: [PATCH 064/208] [InstCombine] Precommit a test This patch precommits a test for: https://github.com/llvm/llvm-project/issues/61183 --- llvm/test/Transforms/InstCombine/bit_floor.ll | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/bit_floor.ll b/llvm/test/Transforms/InstCombine/bit_floor.ll index 0ef7fe3d22e0f..d436e53eb4504 100644 --- a/llvm/test/Transforms/InstCombine/bit_floor.ll +++ b/llvm/test/Transforms/InstCombine/bit_floor.ll @@ -39,5 +39,26 @@ define i64 @bit_floor_64(i64 %x) { ret i64 %sel } +; a vector version of @bit_floor_32 above +define <4 x i32> @bit_floor_v4i32(<4 x i32> %x) { +; CHECK-LABEL: @bit_floor_v4i32( +; CHECK-NEXT: [[EQ0:%.*]] = icmp eq <4 x i32> [[X:%.*]], zeroinitializer +; CHECK-NEXT: [[LSHR:%.*]] = lshr <4 x i32> [[X]], +; CHECK-NEXT: [[CTLZ:%.*]] = tail call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[LSHR]], i1 false), !range [[RNG0]] +; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw <4 x i32> , [[CTLZ]] +; CHECK-NEXT: [[SHL:%.*]] = shl nuw <4 x i32> , [[SUB]] +; CHECK-NEXT: [[SEL:%.*]] = select <4 x i1> [[EQ0]], <4 x 
i32> zeroinitializer, <4 x i32> [[SHL]] +; CHECK-NEXT: ret <4 x i32> [[SEL]] +; + %eq0 = icmp eq <4 x i32> %x, + %lshr = lshr <4 x i32> %x, + %ctlz = tail call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %lshr, i1 false) + %sub = sub <4 x i32> , %ctlz + %shl = shl <4 x i32> , %sub + %sel = select <4 x i1> %eq0, <4 x i32> , <4 x i32> %shl + ret <4 x i32> %sel +} + declare i32 @llvm.ctlz.i32(i32, i1 immarg) declare i64 @llvm.ctlz.i64(i64, i1 immarg) +declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) From 4524db7316b2db9a999b2894a047799dfb6c5cf6 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 22 Mar 2023 22:10:23 -0700 Subject: [PATCH 065/208] [ARM] Use isNullConstant (NFC) --- llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp | 10 +++------- llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp | 5 ++--- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index fbf688de637b5..efacc8b8f3c0e 100644 --- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -2720,10 +2720,7 @@ void ARMDAGToDAGISel::SelectBaseMVE_VMLLDAV(SDNode *N, bool Predicated, } auto OpIsZero = [N](size_t OpNo) { - if (ConstantSDNode *OpConst = dyn_cast(N->getOperand(OpNo))) - if (OpConst->getZExtValue() == 0) - return true; - return false; + return isNullConstant(N->getOperand(OpNo)); }; // If the input accumulator value is not zero, select an instruction with @@ -3990,10 +3987,9 @@ void ARMDAGToDAGISel::Select(SDNode *N) { SDValue SmulLoHi = N->getOperand(1); SDValue Subc = N->getOperand(2); - auto *Zero = dyn_cast(Subc.getOperand(0)); + SDValue Zero = Subc.getOperand(0); - if (!Zero || Zero->getZExtValue() != 0 || - Subc.getOperand(1) != SmulLoHi.getValue(0) || + if (!isNullConstant(Zero) || Subc.getOperand(1) != SmulLoHi.getValue(0) || N->getOperand(1) != SmulLoHi.getValue(1) || N->getOperand(2) != Subc.getValue(1)) break; diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp index 913724daf0ad6..c57825949c1ce 100644 --- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -65,9 +65,8 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall( break; case RTLIB::MEMSET: AEABILibcall = AEABI_MEMSET; - if (ConstantSDNode *ConstantSrc = dyn_cast(Src)) - if (ConstantSrc->getZExtValue() == 0) - AEABILibcall = AEABI_MEMCLR; + if (isNullConstant(Src)) + AEABILibcall = AEABI_MEMCLR; break; default: return SDValue(); From 7bb6d1b32ea9972277201b7651086ab9faffc557 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 22 Mar 2023 22:10:25 -0700 Subject: [PATCH 066/208] [llvm] Skip getAPIntValue (NFC) ConstantSDNode provides some convenience functions like isZero, getZExtValue, and isMinSignedValue that are named identically to those provided by APInt, so we can "skip" getAPIntValue. 
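For example, the visitSUBO hunk below is the whole pattern being cleaned up: a guard that used to be spelled

  if (IsSigned && N1C && !N1C->getAPIntValue().isMinSignedValue())

becomes

  if (IsSigned && N1C && !N1C->isMinSignedValue())

since ConstantSDNode::isMinSignedValue() answers the same question on the wrapped APInt; the remaining hunks are the analogous isZero, isAllOnes, and getZExtValue cases.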
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 ++-- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 7 +++---- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 4 ++-- llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp | 3 +-- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 2 +- 5 files changed, 9 insertions(+), 11 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 4a34d4724ae8f..cc722bcc8c2b3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -4105,7 +4105,7 @@ SDValue DAGCombiner::visitSUBO(SDNode *N) { ConstantSDNode *N1C = getAsNonOpaqueConstant(N1); // fold (subox, c) -> (addo x, -c) - if (IsSigned && N1C && !N1C->getAPIntValue().isMinSignedValue()) { + if (IsSigned && N1C && !N1C->isMinSignedValue()) { return DAG.getNode(ISD::SADDO, DL, N->getVTList(), N0, DAG.getConstant(-N1C->getAPIntValue(), DL, VT)); } @@ -4585,7 +4585,7 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) { return DAG.getNegative(N0, DL, VT); // fold (sdiv X, MIN_SIGNED) -> select(X == MIN_SIGNED, 1, 0) - if (N1C && N1C->getAPIntValue().isMinSignedValue()) + if (N1C && N1C->isMinSignedValue()) return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ), DAG.getConstant(1, DL, VT), DAG.getConstant(0, DL, VT)); diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 9ef3c15cfe374..b7b67a20bc9e9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -3908,8 +3908,7 @@ SDValue TargetLowering::optimizeSetCCOfSignedTruncationCheck( SDValue TargetLowering::optimizeSetCCByHoistingAndByConstFromLogicalShift( EVT SCCVT, SDValue N0, SDValue N1C, ISD::CondCode Cond, DAGCombinerInfo &DCI, const SDLoc &DL) const { - assert(isConstOrConstSplat(N1C) && - isConstOrConstSplat(N1C)->getAPIntValue().isZero() && + assert(isConstOrConstSplat(N1C) && isConstOrConstSplat(N1C)->isZero() && "Should be a comparison with 0."); assert((Cond == ISD::SETEQ || Cond == ISD::SETNE) && "Valid only for [in]equality comparisons."); @@ -4738,8 +4737,8 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, // For example, when high 32-bits of i64 X are known clear: // all bits clear: (X | (Y<<32)) == 0 --> (X | Y) == 0 // all bits set: (X | (Y<<32)) == -1 --> (X & Y) == -1 - bool CmpZero = N1C->getAPIntValue().isZero(); - bool CmpNegOne = N1C->getAPIntValue().isAllOnes(); + bool CmpZero = N1C->isZero(); + bool CmpNegOne = N1C->isAllOnes(); if ((CmpZero || CmpNegOne) && N0.hasOneUse()) { // Match or(lo,shl(hi,bw/2)) pattern. 
auto IsConcat = [&](SDValue V, SDValue &Lo, SDValue &Hi) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index c55b2e4d8fa14..6214c3e935ec4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -2770,7 +2770,7 @@ bool AMDGPUDAGToDAGISel::SelectDotIUVOP3PMods(SDValue In, SDValue &Src) const { assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value"); unsigned Mods = SISrcMods::OP_SEL_1; - unsigned SrcSign = C->getAPIntValue().getZExtValue(); + unsigned SrcSign = C->getZExtValue(); if (SrcSign == 1) Mods ^= SISrcMods::NEG; @@ -2784,7 +2784,7 @@ bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In, assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value"); unsigned Mods = SISrcMods::OP_SEL_1; - unsigned SrcVal = C->getAPIntValue().getZExtValue(); + unsigned SrcVal = C->getZExtValue(); if (SrcVal == 1) Mods |= SISrcMods::OP_SEL_0; diff --git a/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp b/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp index 03015a457a0d1..6ea8e200bd4e9 100644 --- a/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp +++ b/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp @@ -275,8 +275,7 @@ bool AVRDAGToDAGISel::SelectInlineAsmMemoryOperand( } if (ImmNode->getValueType(0) != MVT::i8) { - Disp = CurDAG->getTargetConstant( - ImmNode->getAPIntValue().getZExtValue(), dl, MVT::i8); + Disp = CurDAG->getTargetConstant(ImmNode->getZExtValue(), dl, MVT::i8); } else { Disp = ImmOp; } diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 03a387570e3c6..7670d4d41cd86 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -18149,7 +18149,7 @@ PPC::AddrMode PPCTargetLowering::SelectOptimalAddrMode(const SDNode *Parent, if (Flags & PPC::MOF_RPlusSImm16) { SDValue Op0 = N.getOperand(0); SDValue Op1 = N.getOperand(1); - int16_t Imm = cast(Op1)->getAPIntValue().getZExtValue(); + int16_t Imm = cast(Op1)->getZExtValue(); if (!Align || isAligned(*Align, Imm)) { Disp = DAG.getTargetConstant(Imm, DL, N.getValueType()); Base = Op0; From fd29a4d24267eef0f11d238cb4a32b07d56d6c5c Mon Sep 17 00:00:00 2001 From: wlei Date: Wed, 22 Mar 2023 13:13:27 -0700 Subject: [PATCH 067/208] [Pseudo Probe] Use the name from debug info to compute GUID in probe desc This fixes a GUID mismatch while decoding pseudo probes: a GUID from the inline tree is not in the GUID2FuncDescMap. It turned out that the frontend could change the function name, making it different from the one in debug info (https://reviews.llvm.org/D111009). Here we change the probe descriptor to use the function name from debug info, to be consistent with the probe name from the inline stack. Reviewed By: hoy, wenlei Differential Revision: https://reviews.llvm.org/D146657 --- llvm/include/llvm/IR/MDBuilder.h | 2 +- llvm/lib/IR/MDBuilder.cpp | 4 ++-- llvm/lib/Transforms/IPO/SampleProfileProbe.cpp | 15 +++++++++++---- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/llvm/include/llvm/IR/MDBuilder.h b/llvm/include/llvm/IR/MDBuilder.h index bd542bd0d2b2b..39165453de16b 100644 --- a/llvm/include/llvm/IR/MDBuilder.h +++ b/llvm/include/llvm/IR/MDBuilder.h @@ -78,7 +78,7 @@ class MDBuilder { MDNode *createFunctionSectionPrefix(StringRef Prefix); /// Return metadata containing the pseudo probe descriptor for a function.
- MDNode *createPseudoProbeDesc(uint64_t GUID, uint64_t Hash, Function *F); + MDNode *createPseudoProbeDesc(uint64_t GUID, uint64_t Hash, StringRef FName); /// Return metadata containing llvm statistics. MDNode * diff --git a/llvm/lib/IR/MDBuilder.cpp b/llvm/lib/IR/MDBuilder.cpp index 38ab1d3d10244..2490b3012bdc2 100644 --- a/llvm/lib/IR/MDBuilder.cpp +++ b/llvm/lib/IR/MDBuilder.cpp @@ -336,12 +336,12 @@ MDNode *MDBuilder::createIrrLoopHeaderWeight(uint64_t Weight) { } MDNode *MDBuilder::createPseudoProbeDesc(uint64_t GUID, uint64_t Hash, - Function *F) { + StringRef FName) { auto *Int64Ty = Type::getInt64Ty(Context); SmallVector Ops(3); Ops[0] = createConstant(ConstantInt::get(Int64Ty, GUID)); Ops[1] = createConstant(ConstantInt::get(Int64Ty, Hash)); - Ops[2] = createString(F->getName()); + Ops[2] = createString(FName); return MDNode::get(Context, Ops); } diff --git a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp index 7a40ddee81798..ed1d5575db69a 100644 --- a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp @@ -286,9 +286,16 @@ uint32_t SampleProfileProber::getCallsiteId(const Instruction *Call) const { void SampleProfileProber::instrumentOneFunc(Function &F, TargetMachine *TM) { Module *M = F.getParent(); MDBuilder MDB(F.getContext()); - // Compute a GUID without considering the function's linkage type. This is - // fine since function name is the only key in the profile database. - uint64_t Guid = Function::getGUID(F.getName()); + // Since the GUID from probe desc and inline stack are computed separately, we + // need to make sure their names are consistent, so here also use the name + // from debug info. + StringRef FName = F.getName(); + if (auto *SP = F.getSubprogram()) { + FName = SP->getLinkageName(); + if (FName.empty()) + FName = SP->getName(); + } + uint64_t Guid = Function::getGUID(FName); // Assign an artificial debug line to a probe that doesn't come with a real // line. A probe not having a debug line will get an incomplete inline @@ -371,7 +378,7 @@ void SampleProfileProber::instrumentOneFunc(Function &F, TargetMachine *TM) { // - FunctionHash. // - FunctionName auto Hash = getFunctionHash(); - auto *MD = MDB.createPseudoProbeDesc(Guid, Hash, &F); + auto *MD = MDB.createPseudoProbeDesc(Guid, Hash, FName); auto *NMD = M->getNamedMetadata(PseudoProbeDescMetadataName); assert(NMD && "llvm.pseudo_probe_desc should be pre-created"); NMD->addOperand(MD); From c2df1d8a6d1cab95637a3b40d49a15e535135b33 Mon Sep 17 00:00:00 2001 From: "Wu, Yingcong" Date: Wed, 22 Mar 2023 21:49:19 -0700 Subject: [PATCH 068/208] [libfuzzer] add test of cov file-id in control file There is a test for the ft file-id in the control file, but no test for the cov line. Without such a test, an invalid cov file-id would cause a crash.
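For reference, the control file the new test builds looks like this (paths shortened here for illustration):

  3
  0
  T1/1
  T1/2
  T1/3
  STARTED 0 1
  FT 0 11
  STARTED 1 2
  FT 1 12
  STARTED 2 2
  FT 2 13
  COV 21 13

The file id after an FT or COV marker must match the id of the preceding STARTED line, so the final COV entry refers to a file that was never started; with the added check the merger reports "bad control file, will overwrite it" instead of crashing on it.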
Reviewed By: vitalybuka Differential Revision: https://reviews.llvm.org/D145672 --- compiler-rt/lib/fuzzer/FuzzerMerge.cpp | 8 ++++++-- compiler-rt/test/fuzzer/merge-control-file.test | 14 ++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/compiler-rt/lib/fuzzer/FuzzerMerge.cpp b/compiler-rt/lib/fuzzer/FuzzerMerge.cpp index 24bd11958e807..8c8806e8aafd3 100644 --- a/compiler-rt/lib/fuzzer/FuzzerMerge.cpp +++ b/compiler-rt/lib/fuzzer/FuzzerMerge.cpp @@ -77,6 +77,7 @@ bool Merger::Parse(std::istream &IS, bool ParseCoverage) { size_t ExpectedStartMarker = 0; const size_t kInvalidStartMarker = -1; size_t LastSeenStartMarker = kInvalidStartMarker; + bool HaveFtMarker = true; std::vector TmpFeatures; std::set PCs; while (std::getline(IS, Line, '\n')) { @@ -93,12 +94,13 @@ bool Merger::Parse(std::istream &IS, bool ParseCoverage) { LastSeenStartMarker = ExpectedStartMarker; assert(ExpectedStartMarker < Files.size()); ExpectedStartMarker++; + HaveFtMarker = false; } else if (Marker == "FT") { // FT FILE_ID COV1 COV2 COV3 ... size_t CurrentFileIdx = N; if (CurrentFileIdx != LastSeenStartMarker) return false; - LastSeenStartMarker = kInvalidStartMarker; + HaveFtMarker = true; if (ParseCoverage) { TmpFeatures.clear(); // use a vector from outer scope to avoid resizes. while (ISS1 >> N) @@ -108,6 +110,8 @@ bool Merger::Parse(std::istream &IS, bool ParseCoverage) { } } else if (Marker == "COV") { size_t CurrentFileIdx = N; + if (CurrentFileIdx != LastSeenStartMarker) + return false; if (ParseCoverage) while (ISS1 >> N) if (PCs.insert(N).second) @@ -116,7 +120,7 @@ bool Merger::Parse(std::istream &IS, bool ParseCoverage) { return false; } } - if (LastSeenStartMarker != kInvalidStartMarker) + if (!HaveFtMarker && LastSeenStartMarker != kInvalidStartMarker) LastFailure = Files[LastSeenStartMarker].Name; FirstNotProcessedFile = ExpectedStartMarker; diff --git a/compiler-rt/test/fuzzer/merge-control-file.test b/compiler-rt/test/fuzzer/merge-control-file.test index ebd2cf5af3baa..c7d666ea471e9 100644 --- a/compiler-rt/test/fuzzer/merge-control-file.test +++ b/compiler-rt/test/fuzzer/merge-control-file.test @@ -50,3 +50,17 @@ RUN: echo STARTED 2 2 >> %t/MCF RUN: echo FT 2 13 >> %t/MCF RUN: %run %t/T.exe -merge=1 %t/T1 %t/T2 -merge_control_file=%t/MCF 2>&1 | FileCheck %s --check-prefix=OK_3 OK_3: MERGE-OUTER: nothing to do, merge has been completed before + +# Test for invalid COV file_id +RUN: rm -f %t/T1/*; cp %t/T0/* %t/T1 +RUN: echo 3 > %t/MCF; echo 0 >> %t/MCF; echo %t/T1/1 >> %t/MCF; echo %t/T1/2 >> %t/MCF; echo %t/T1/3 >> %t/MCF +RUN: echo STARTED 0 1 >> %t/MCF +RUN: echo FT 0 11 >> %t/MCF +RUN: echo STARTED 1 2 >> %t/MCF +RUN: echo FT 1 12 >> %t/MCF +RUN: echo STARTED 2 2 >> %t/MCF +RUN: echo FT 2 13 >> %t/MCF +# Invalid file-id 21 here +RUN: echo COV 21 13 >> %t/MCF +RUN: %run %t/T.exe -merge=1 %t/T1 %t/T2 -merge_control_file=%t/MCF 2>&1 | FileCheck %s --check-prefix=COV_INVALID +COV_INVALID: MERGE-OUTER: bad control file, will overwrite it From 021edda0b20468e20a72b1788721b2b70228bffb Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 22 Mar 2023 23:00:06 -0700 Subject: [PATCH 069/208] [TableGen] Simplify CodeGenHwModes constructor. NFC Remove the loop that erases the DefaultMode from the Record vector. Instead we can skip over in the loop that creates HwMode objects. 
--- llvm/utils/TableGen/CodeGenHwModes.cpp | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/llvm/utils/TableGen/CodeGenHwModes.cpp b/llvm/utils/TableGen/CodeGenHwModes.cpp index 99a97e89e60c5..d8652dfa121f3 100644 --- a/llvm/utils/TableGen/CodeGenHwModes.cpp +++ b/llvm/utils/TableGen/CodeGenHwModes.cpp @@ -65,23 +65,16 @@ void HwModeSelect::dump() const { } CodeGenHwModes::CodeGenHwModes(RecordKeeper &RK) : Records(RK) { - std::vector MRs = Records.getAllDerivedDefinitions("HwMode"); - // The default mode needs a definition in the .td sources for TableGen - // to accept references to it. We need to ignore the definition here. - for (auto I = MRs.begin(), E = MRs.end(); I != E; ++I) { - if ((*I)->getName() != DefaultModeName) + for (Record *R : Records.getAllDerivedDefinitions("HwMode")) { + // The default mode needs a definition in the .td sources for TableGen + // to accept references to it. We need to ignore the definition here. + if (R->getName() == DefaultModeName) continue; - MRs.erase(I); - break; - } - - for (Record *R : MRs) { - Modes.emplace_back(R); + Modes.push_back(R); ModeIds.insert(std::make_pair(R, Modes.size())); } - std::vector MSs = Records.getAllDerivedDefinitions("HwModeSelect"); - for (Record *R : MSs) { + for (Record *R : Records.getAllDerivedDefinitions("HwModeSelect")) { auto P = ModeSelects.emplace(std::make_pair(R, HwModeSelect(R, *this))); assert(P.second); (void)P; From b3256047d01f3cc57a617d984612d63b28998de7 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 22 Mar 2023 23:17:15 -0700 Subject: [PATCH 070/208] [TableGen] Change push_back back to emplace_back. This was a mistake I made in 021edda0b20468e20a72b1788721b2b70228bffb. --- llvm/utils/TableGen/CodeGenHwModes.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/TableGen/CodeGenHwModes.cpp b/llvm/utils/TableGen/CodeGenHwModes.cpp index d8652dfa121f3..2171507f4c63f 100644 --- a/llvm/utils/TableGen/CodeGenHwModes.cpp +++ b/llvm/utils/TableGen/CodeGenHwModes.cpp @@ -70,7 +70,7 @@ CodeGenHwModes::CodeGenHwModes(RecordKeeper &RK) : Records(RK) { // to accept references to it. We need to ignore the definition here. if (R->getName() == DefaultModeName) continue; - Modes.push_back(R); + Modes.emplace_back(R); ModeIds.insert(std::make_pair(R, Modes.size())); } From f9f4767af9f3d89792d67ae8c5f65913ff263b89 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 22 Mar 2023 23:27:59 -0700 Subject: [PATCH 071/208] [InstCombine] Precommit tests This patch precommits tests for: https://github.com/llvm/llvm-project/issues/61183 --- llvm/test/Transforms/InstCombine/bit_floor.ll | 108 ++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/bit_floor.ll b/llvm/test/Transforms/InstCombine/bit_floor.ll index d436e53eb4504..9daa8eee8969c 100644 --- a/llvm/test/Transforms/InstCombine/bit_floor.ll +++ b/llvm/test/Transforms/InstCombine/bit_floor.ll @@ -39,6 +39,114 @@ define i64 @bit_floor_64(i64 %x) { ret i64 %sel } +; Commutted select operands should still be recognized. 
+define i32 @bit_floor_commuted_operands(i32 %x) { +; CHECK-LABEL: @bit_floor_commuted_operands( +; CHECK-NEXT: [[NE0_NOT:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[LSHR:%.*]] = lshr i32 [[X]], 1 +; CHECK-NEXT: [[CTLZ:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[LSHR]], i1 false), !range [[RNG0]] +; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i32 32, [[CTLZ]] +; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 1, [[SUB]] +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[NE0_NOT]], i32 0, i32 [[SHL]] +; CHECK-NEXT: ret i32 [[SEL]] +; + %ne0 = icmp ne i32 %x, 0 + %lshr = lshr i32 %x, 1 + %ctlz = tail call i32 @llvm.ctlz.i32(i32 %lshr, i1 false) + %sub = sub i32 32, %ctlz + %shl = shl i32 1, %sub + %sel = select i1 %ne0, i32 %shl, i32 0 + ret i32 %sel +} + +; Negative test: lshr used twice +define i32 @bit_floor_lshr_used_twice(i32 %x, ptr %p) { +; CHECK-LABEL: @bit_floor_lshr_used_twice( +; CHECK-NEXT: [[EQ0:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[LSHR:%.*]] = lshr i32 [[X]], 1 +; CHECK-NEXT: [[CTLZ:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[LSHR]], i1 false), !range [[RNG0]] +; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i32 32, [[CTLZ]] +; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 1, [[SUB]] +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[EQ0]], i32 0, i32 [[SHL]] +; CHECK-NEXT: store i32 [[LSHR]], ptr [[P:%.*]], align 4 +; CHECK-NEXT: ret i32 [[SEL]] +; + %eq0 = icmp eq i32 %x, 0 + %lshr = lshr i32 %x, 1 + %ctlz = tail call i32 @llvm.ctlz.i32(i32 %lshr, i1 false) + %sub = sub i32 32, %ctlz + %shl = shl i32 1, %sub + %sel = select i1 %eq0, i32 0, i32 %shl + store i32 %lshr, ptr %p, align 4 + ret i32 %sel +} + +; Negative test: ctlz used twice +define i32 @bit_floor_ctlz_used_twice(i32 %x, ptr %p) { +; CHECK-LABEL: @bit_floor_ctlz_used_twice( +; CHECK-NEXT: [[EQ0:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[LSHR:%.*]] = lshr i32 [[X]], 1 +; CHECK-NEXT: [[CTLZ:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[LSHR]], i1 false), !range [[RNG0]] +; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i32 32, [[CTLZ]] +; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 1, [[SUB]] +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[EQ0]], i32 0, i32 [[SHL]] +; CHECK-NEXT: store i32 [[CTLZ]], ptr [[P:%.*]], align 4 +; CHECK-NEXT: ret i32 [[SEL]] +; + %eq0 = icmp eq i32 %x, 0 + %lshr = lshr i32 %x, 1 + %ctlz = tail call i32 @llvm.ctlz.i32(i32 %lshr, i1 false) + %sub = sub i32 32, %ctlz + %shl = shl i32 1, %sub + %sel = select i1 %eq0, i32 0, i32 %shl + store i32 %ctlz, ptr %p, align 4 + ret i32 %sel +} + +; Negative test: sub used twice +define i32 @bit_floor_sub_used_twice(i32 %x, ptr %p) { +; CHECK-LABEL: @bit_floor_sub_used_twice( +; CHECK-NEXT: [[EQ0:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[LSHR:%.*]] = lshr i32 [[X]], 1 +; CHECK-NEXT: [[CTLZ:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[LSHR]], i1 false), !range [[RNG0]] +; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i32 32, [[CTLZ]] +; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 1, [[SUB]] +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[EQ0]], i32 0, i32 [[SHL]] +; CHECK-NEXT: store i32 [[SUB]], ptr [[P:%.*]], align 4 +; CHECK-NEXT: ret i32 [[SEL]] +; + %eq0 = icmp eq i32 %x, 0 + %lshr = lshr i32 %x, 1 + %ctlz = tail call i32 @llvm.ctlz.i32(i32 %lshr, i1 false) + %sub = sub i32 32, %ctlz + %shl = shl i32 1, %sub + %sel = select i1 %eq0, i32 0, i32 %shl + store i32 %sub, ptr %p, align 4 + ret i32 %sel +} + +; Negative test: shl used twice +define i32 @bit_floor_shl_used_twice(i32 %x, ptr %p) { +; CHECK-LABEL: @bit_floor_shl_used_twice( +; CHECK-NEXT: [[EQ0:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[LSHR:%.*]] = 
lshr i32 [[X]], 1 +; CHECK-NEXT: [[CTLZ:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[LSHR]], i1 false), !range [[RNG0]] +; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i32 32, [[CTLZ]] +; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 1, [[SUB]] +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[EQ0]], i32 0, i32 [[SHL]] +; CHECK-NEXT: store i32 [[SHL]], ptr [[P:%.*]], align 4 +; CHECK-NEXT: ret i32 [[SEL]] +; + %eq0 = icmp eq i32 %x, 0 + %lshr = lshr i32 %x, 1 + %ctlz = tail call i32 @llvm.ctlz.i32(i32 %lshr, i1 false) + %sub = sub i32 32, %ctlz + %shl = shl i32 1, %sub + %sel = select i1 %eq0, i32 0, i32 %shl + store i32 %shl, ptr %p, align 4 + ret i32 %sel +} + ; a vector version of @bit_floor_32 above define <4 x i32> @bit_floor_v4i32(<4 x i32> %x) { ; CHECK-LABEL: @bit_floor_v4i32( From a35f9c6e0813c92e5efb0aac3e2681c7e1651f7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Degioanni?= Date: Thu, 23 Mar 2023 07:58:37 +0100 Subject: [PATCH 072/208] [mlir][llvm] Add poison constant. This patch introduces the poison constant from LLVM in the LLVM IR dialect. It also adds import and export support for it, along with roundtrip tests. Reviewed By: gysit Differential Revision: https://reviews.llvm.org/D146631 --- mlir/docs/Dialects/LLVM.md | 1 + mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 24 ++++++++++++++++++- mlir/lib/Target/LLVMIR/ModuleImport.cpp | 6 +++++ mlir/test/Dialect/LLVMIR/roundtrip.mlir | 8 +++++-- .../test/Target/LLVMIR/Import/instructions.ll | 3 +++ mlir/test/Target/LLVMIR/llvmir.mlir | 3 +++ 6 files changed, 42 insertions(+), 3 deletions(-) diff --git a/mlir/docs/Dialects/LLVM.md b/mlir/docs/Dialects/LLVM.md index 53d4dfbf686e7..c41d7254a378c 100644 --- a/mlir/docs/Dialects/LLVM.md +++ b/mlir/docs/Dialects/LLVM.md @@ -105,6 +105,7 @@ values for thread-safety and concept parsimony reasons. Instead, regular values are produced by dedicated operations that have the corresponding semantics: [`llvm.mlir.constant`](#llvmmlirconstant-mlirllvmconstantop), [`llvm.mlir.undef`](#llvmmlirundef-mlirllvmundefop), +[`llvm.mlir.poison`](#llvmmlirpoison-mlirllvmpoisonop), [`llvm.mlir.null`](#llvmmlirnull-mlirllvmnullop). Note how these operations are prefixed with `mlir.` to indicate that they don't belong to LLVM IR but are only necessary to model it in MLIR. The values produced by these operations are diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index 1bbc32f3d2917..3abe1614f66fa 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -1652,7 +1652,7 @@ def LLVM_UndefOp : LLVM_Op<"mlir.undef", [Pure]>, Unlike LLVM IR, MLIR does not have first-class undefined values. Such values must be created as SSA values using `llvm.mlir.undef`. This operation has no operands or attributes. It creates an undefined value of the specified LLVM - IR dialect type wrapping an LLVM IR structure type. + IR dialect type. Example: @@ -1666,6 +1666,28 @@ def LLVM_UndefOp : LLVM_Op<"mlir.undef", [Pure]>, let assemblyFormat = "attr-dict `:` type($res)"; } +def LLVM_PoisonOp : LLVM_Op<"mlir.poison", [Pure]>, + LLVM_Builder<"$res = llvm::PoisonValue::get($_resultType);"> { + let summary = "Creates a poison value of LLVM dialect type."; + let description = [{ + Unlike LLVM IR, MLIR does not have first-class poison values. Such values + must be created as SSA values using `llvm.mlir.poison`. This operation has + no operands or attributes. It creates a poison value of the specified LLVM + IR dialect type. 
+ + Example: + + ```mlir + // Create a poison value for a structure with a 32-bit integer followed + // by a float. + %0 = llvm.mlir.poison : !llvm.struct<(i32, f32)> + ``` + }]; + let results = (outs LLVM_Type:$res); + let builders = [LLVM_OneResultOpBuilder]; + let assemblyFormat = "attr-dict `:` type($res)"; +} + def LLVM_ConstantOp : LLVM_Op<"mlir.constant", [Pure, ConstantLike]>, LLVM_Builder<[{$res = getLLVMConstant($_resultType, $value, $_location, diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp index d3ac7dcc17554..707f28d6c3641 100644 --- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp @@ -1036,6 +1036,12 @@ FailureOr ModuleImport::convertConstant(llvm::Constant *constant) { return builder.create(loc, type).getResult(); } + // Convert poison. + if (auto *poisonVal = dyn_cast(constant)) { + Type type = convertType(poisonVal->getType()); + return builder.create(loc, type).getResult(); + } + // Convert undef. if (auto *undefVal = dyn_cast(constant)) { Type type = convertType(undefVal->getType()); diff --git a/mlir/test/Dialect/LLVMIR/roundtrip.mlir b/mlir/test/Dialect/LLVMIR/roundtrip.mlir index 9147027c9d4b2..c495a36bc0678 100644 --- a/mlir/test/Dialect/LLVMIR/roundtrip.mlir +++ b/mlir/test/Dialect/LLVMIR/roundtrip.mlir @@ -438,10 +438,14 @@ llvm.func @invokeLandingpad() -> i32 attributes { personality = @__gxx_personali func.func @useFreezeOp(%arg0: i32) { // CHECK: = llvm.freeze %[[ARG0:.*]] : i32 %0 = llvm.freeze %arg0 : i32 - // CHECK: %[[x:.*]] = llvm.mlir.undef : i8 + // CHECK: %[[UNDEF:.*]] = llvm.mlir.undef : i8 %1 = llvm.mlir.undef : i8 - // CHECK: = llvm.freeze %[[x]] : i8 + // CHECK: = llvm.freeze %[[UNDEF]] : i8 %2 = llvm.freeze %1 : i8 + // CHECK: %[[POISON:.*]] = llvm.mlir.poison : i8 + %3 = llvm.mlir.poison : i8 + // CHECK: = llvm.freeze %[[POISON]] : i8 + %4 = llvm.freeze %3 : i8 return } diff --git a/mlir/test/Target/LLVMIR/Import/instructions.ll b/mlir/test/Target/LLVMIR/Import/instructions.ll index cbdb0ebe295ef..3f5ade4f15735 100644 --- a/mlir/test/Target/LLVMIR/Import/instructions.ll +++ b/mlir/test/Target/LLVMIR/Import/instructions.ll @@ -523,10 +523,13 @@ define void @gep_dynamic_idx(ptr %ptr, i32 %idx) { ; CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]] define void @freeze(i32 %arg1) { ; CHECK: %[[UNDEF:[0-9]+]] = llvm.mlir.undef : i64 + ; CHECK: %[[POISON:[0-9]+]] = llvm.mlir.poison : i16 ; CHECK: llvm.freeze %[[ARG1]] : i32 ; CHECK: llvm.freeze %[[UNDEF]] : i64 + ; CHECK: llvm.freeze %[[POISON]] : i16 %1 = freeze i32 %arg1 %2 = freeze i64 undef + %3 = freeze i16 poison ret void } diff --git a/mlir/test/Target/LLVMIR/llvmir.mlir b/mlir/test/Target/LLVMIR/llvmir.mlir index 46120cb348296..6d340bc57fcd1 100644 --- a/mlir/test/Target/LLVMIR/llvmir.mlir +++ b/mlir/test/Target/LLVMIR/llvmir.mlir @@ -1618,6 +1618,9 @@ llvm.func @callFreezeOp(%x : i32) { %1 = llvm.mlir.undef : i32 // CHECK: freeze i32 undef %2 = llvm.freeze %1 : i32 + %3 = llvm.mlir.poison : i32 + // CHECK: freeze i32 poison + %4 = llvm.freeze %3 : i32 llvm.return } From c5d22f4e1866549f0d8c3aad598c0153c31679e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Thu, 23 Mar 2023 08:22:23 +0100 Subject: [PATCH 073/208] [clang][Sema][NFC] Make local variable const --- clang/lib/Sema/SemaDeclAttr.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 229e73618c53c..efa275c0aa12b 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ 
b/clang/lib/Sema/SemaDeclAttr.cpp @@ -3759,7 +3759,7 @@ static void handleEnumExtensibilityAttr(Sema &S, Decl *D, /// Handle __attribute__((format_arg((idx)))) attribute based on /// http://gcc.gnu.org/onlinedocs/gcc/Function-Attributes.html static void handleFormatArgAttr(Sema &S, Decl *D, const ParsedAttr &AL) { - Expr *IdxExpr = AL.getArgAsExpr(0); + const Expr *IdxExpr = AL.getArgAsExpr(0); ParamIdx Idx; if (!checkFunctionOrMethodParameterIndex(S, D, AL, 1, IdxExpr, Idx)) return; From 79df1a785d8e45a00906709fc403ba4bd5af4f66 Mon Sep 17 00:00:00 2001 From: Tobias Gysi Date: Thu, 23 Mar 2023 08:13:54 +0100 Subject: [PATCH 074/208] [mlir][llvm] Switch rountrip tests to opaque pointers. The revision switches all remaining LLVM dialect roundtrip tests to opaque pointers. Selected tests are copied to a postfixed test file for the time being. Part of https://discourse.llvm.org/t/rfc-switching-the-llvm-dialect-and-dialect-lowerings-to-opaque-pointers/68179 Reviewed By: zero9178 Differential Revision: https://reviews.llvm.org/D146639 --- .../LLVMIR/roundtrip-typed-pointers.mlir | 73 +++++++ mlir/test/Dialect/LLVMIR/roundtrip.mlir | 187 ++++++++---------- 2 files changed, 150 insertions(+), 110 deletions(-) create mode 100644 mlir/test/Dialect/LLVMIR/roundtrip-typed-pointers.mlir diff --git a/mlir/test/Dialect/LLVMIR/roundtrip-typed-pointers.mlir b/mlir/test/Dialect/LLVMIR/roundtrip-typed-pointers.mlir new file mode 100644 index 0000000000000..7cc5a6deee541 --- /dev/null +++ b/mlir/test/Dialect/LLVMIR/roundtrip-typed-pointers.mlir @@ -0,0 +1,73 @@ +// RUN: mlir-opt %s | mlir-opt | FileCheck %s + +// CHECK-LABEL: func @ops +// CHECK-SAME: %[[I32:.*]]: i32 +func.func @ops(%arg0: i32) { +// Memory-related operations. +// +// CHECK-NEXT: %[[ALLOCA:.*]] = llvm.alloca %[[I32]] x f64 : (i32) -> !llvm.ptr +// CHECK-NEXT: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][%[[I32]], %[[I32]]] : (!llvm.ptr, i32, i32) -> !llvm.ptr +// CHECK-NEXT: %[[VALUE:.*]] = llvm.load %[[GEP]] : !llvm.ptr +// CHECK-NEXT: llvm.store %[[VALUE]], %[[ALLOCA]] : !llvm.ptr +// CHECK-NEXT: %{{.*}} = llvm.bitcast %[[ALLOCA]] : !llvm.ptr to !llvm.ptr + %13 = llvm.alloca %arg0 x f64 : (i32) -> !llvm.ptr + %14 = llvm.getelementptr %13[%arg0, %arg0] : (!llvm.ptr, i32, i32) -> !llvm.ptr + %15 = llvm.load %14 : !llvm.ptr + llvm.store %15, %13 : !llvm.ptr + %16 = llvm.bitcast %13 : !llvm.ptr to !llvm.ptr + llvm.return +} + +// CHECK-LABEL: @gep +llvm.func @gep(%ptr: !llvm.ptr)>>, %idx: i64, + %ptr2: !llvm.ptr)>>) { + // CHECK: llvm.getelementptr %{{.*}}[%{{.*}}, 1, 0] : (!llvm.ptr)>>, i64) -> !llvm.ptr + llvm.getelementptr %ptr[%idx, 1, 0] : (!llvm.ptr)>>, i64) -> !llvm.ptr + // CHECK: llvm.getelementptr inbounds %{{.*}}[%{{.*}}, 0, %{{.*}}] : (!llvm.ptr)>>, i64, i64) -> !llvm.ptr + llvm.getelementptr inbounds %ptr2[%idx, 0, %idx] : (!llvm.ptr)>>, i64, i64) -> !llvm.ptr + llvm.return +} + +// CHECK-LABEL: @alloca +func.func @alloca(%size : i64) { + // CHECK: llvm.alloca %{{.*}} x i32 : (i64) -> !llvm.ptr + llvm.alloca %size x i32 {alignment = 0} : (i64) -> (!llvm.ptr) + // CHECK: llvm.alloca inalloca %{{.*}} x i32 {alignment = 8 : i64} : (i64) -> !llvm.ptr + llvm.alloca inalloca %size x i32 {alignment = 8} : (i64) -> (!llvm.ptr) + llvm.return +} + +// CHECK-LABEL: @null +func.func @null() { + // CHECK: llvm.mlir.null : !llvm.ptr + %0 = llvm.mlir.null : !llvm.ptr + // CHECK: llvm.mlir.null : !llvm.ptr>)>>, i64)>> + %1 = llvm.mlir.null : !llvm.ptr>)>>, i64)>> + llvm.return +} + +// CHECK-LABEL: llvm.func @vararg_func +llvm.func 
@vararg_func(%arg0: i32, ...) { + // CHECK: %{{.*}} = llvm.mlir.constant(1 : i32) : i32 + // CHECK: %{{.*}} = llvm.mlir.constant(1 : i32) : i32 + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.mlir.constant(1 : i32) : i32 + // CHECK: %[[ALLOCA0:.+]] = llvm.alloca %{{.*}} x !llvm.struct<"struct.va_list", (ptr)> {alignment = 8 : i64} : (i32) -> !llvm.ptr)>> + // CHECK: %[[CAST0:.+]] = llvm.bitcast %[[ALLOCA0]] : !llvm.ptr)>> to !llvm.ptr + %2 = llvm.alloca %1 x !llvm.struct<"struct.va_list", (ptr)> {alignment = 8 : i64} : (i32) -> !llvm.ptr)>> + %3 = llvm.bitcast %2 : !llvm.ptr)>> to !llvm.ptr + // CHECK: llvm.intr.vastart %[[CAST0]] + llvm.intr.vastart %3 : !llvm.ptr + // CHECK: %[[ALLOCA1:.+]] = llvm.alloca %{{.*}} x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr> + // CHECK: %[[CAST1:.+]] = llvm.bitcast %[[ALLOCA1]] : !llvm.ptr> to !llvm.ptr + %4 = llvm.alloca %0 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr> + %5 = llvm.bitcast %4 : !llvm.ptr> to !llvm.ptr + // CHECK: llvm.intr.vacopy %[[CAST0]] to %[[CAST1]] + llvm.intr.vacopy %3 to %5 : !llvm.ptr, !llvm.ptr + // CHECK: llvm.intr.vaend %[[CAST1]] + // CHECK: llvm.intr.vaend %[[CAST0]] + llvm.intr.vaend %5 : !llvm.ptr + llvm.intr.vaend %3 : !llvm.ptr + // CHECK: llvm.return + llvm.return +} diff --git a/mlir/test/Dialect/LLVMIR/roundtrip.mlir b/mlir/test/Dialect/LLVMIR/roundtrip.mlir index c495a36bc0678..b430c56fe7aa2 100644 --- a/mlir/test/Dialect/LLVMIR/roundtrip.mlir +++ b/mlir/test/Dialect/LLVMIR/roundtrip.mlir @@ -1,10 +1,10 @@ // RUN: mlir-opt %s | mlir-opt | FileCheck %s // CHECK-LABEL: func @ops -// CHECK-SAME: (%[[I32:.*]]: i32, %[[FLOAT:.*]]: f32, %[[I8PTR1:.*]]: !llvm.ptr, %[[I8PTR2:.*]]: !llvm.ptr, %[[BOOL:.*]]: i1, %[[VI8PTR1:.*]]: !llvm.vec<2 x ptr>) +// CHECK-SAME: (%[[I32:.*]]: i32, %[[FLOAT:.*]]: f32, %[[PTR1:.*]]: !llvm.ptr, %[[PTR2:.*]]: !llvm.ptr, %[[BOOL:.*]]: i1, %[[VPTR1:.*]]: !llvm.vec<2 x ptr>) func.func @ops(%arg0: i32, %arg1: f32, - %arg2: !llvm.ptr, %arg3: !llvm.ptr, - %arg4: i1, %arg5 : !llvm.vec<2x!llvm.ptr>) { + %arg2: !llvm.ptr, %arg3: !llvm.ptr, + %arg4: i1, %arg5 : !llvm.vec<2x!llvm.ptr>) { // Integer arithmetic binary operations. 
// // CHECK: {{.*}} = llvm.add %[[I32]], %[[I32]] : i32 @@ -16,9 +16,9 @@ func.func @ops(%arg0: i32, %arg1: f32, // CHECK: {{.*}} = llvm.srem %[[I32]], %[[I32]] : i32 // CHECK: %[[SCALAR_PRED0:.+]] = llvm.icmp "ne" %[[I32]], %[[I32]] : i32 // CHECK: {{.*}} = llvm.add %[[SCALAR_PRED0]], %[[SCALAR_PRED0]] : i1 -// CHECK: %[[SCALAR_PRED1:.+]] = llvm.icmp "ne" %[[I8PTR1]], %[[I8PTR1]] : !llvm.ptr +// CHECK: %[[SCALAR_PRED1:.+]] = llvm.icmp "ne" %[[PTR1]], %[[PTR1]] : !llvm.ptr // CHECK: {{.*}} = llvm.add %[[SCALAR_PRED1]], %[[SCALAR_PRED1]] : i1 -// CHECK: %[[VEC_PRED:.+]] = llvm.icmp "ne" %[[VI8PTR1]], %[[VI8PTR1]] : !llvm.vec<2 x ptr> +// CHECK: %[[VEC_PRED:.+]] = llvm.icmp "ne" %[[VPTR1]], %[[VPTR1]] : !llvm.vec<2 x ptr> // CHECK: {{.*}} = llvm.add %[[VEC_PRED]], %[[VEC_PRED]] : vector<2xi1> %0 = llvm.add %arg0, %arg0 : i32 %1 = llvm.sub %arg0, %arg0 : i32 @@ -29,9 +29,9 @@ func.func @ops(%arg0: i32, %arg1: f32, %6 = llvm.srem %arg0, %arg0 : i32 %7 = llvm.icmp "ne" %arg0, %arg0 : i32 %typecheck_7 = llvm.add %7, %7 : i1 - %ptrcmp = llvm.icmp "ne" %arg2, %arg2 : !llvm.ptr + %ptrcmp = llvm.icmp "ne" %arg2, %arg2 : !llvm.ptr %typecheck_ptrcmp = llvm.add %ptrcmp, %ptrcmp : i1 - %vptrcmp = llvm.icmp "ne" %arg5, %arg5 : !llvm.vec<2 x ptr> + %vptrcmp = llvm.icmp "ne" %arg5, %arg5 : !llvm.vec<2 x ptr> %typecheck_vptrcmp = llvm.add %vptrcmp, %vptrcmp : vector<2 x i1> // Floating point binary operations. @@ -49,16 +49,14 @@ func.func @ops(%arg0: i32, %arg1: f32, // Memory-related operations. // -// CHECK-NEXT: %[[ALLOCA:.*]] = llvm.alloca %[[I32]] x f64 : (i32) -> !llvm.ptr -// CHECK-NEXT: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][%[[I32]], %[[I32]]] : (!llvm.ptr, i32, i32) -> !llvm.ptr -// CHECK-NEXT: %[[VALUE:.*]] = llvm.load %[[GEP]] : !llvm.ptr -// CHECK-NEXT: llvm.store %[[VALUE]], %[[ALLOCA]] : !llvm.ptr -// CHECK-NEXT: %{{.*}} = llvm.bitcast %[[ALLOCA]] : !llvm.ptr to !llvm.ptr - %13 = llvm.alloca %arg0 x f64 : (i32) -> !llvm.ptr - %14 = llvm.getelementptr %13[%arg0, %arg0] : (!llvm.ptr, i32, i32) -> !llvm.ptr - %15 = llvm.load %14 : !llvm.ptr - llvm.store %15, %13 : !llvm.ptr - %16 = llvm.bitcast %13 : !llvm.ptr to !llvm.ptr +// CHECK-NEXT: %[[ALLOCA:.*]] = llvm.alloca %[[I32]] x f64 : (i32) -> !llvm.ptr +// CHECK-NEXT: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][%[[I32]], %[[I32]]] : (!llvm.ptr, i32, i32) -> !llvm.ptr, f64 +// CHECK-NEXT: %[[VALUE:.*]] = llvm.load %[[GEP]] : !llvm.ptr -> f64 +// CHECK-NEXT: llvm.store %[[VALUE]], %[[ALLOCA]] : f64, !llvm.ptr + %13 = llvm.alloca %arg0 x f64 : (i32) -> !llvm.ptr + %14 = llvm.getelementptr %13[%arg0, %arg0] : (!llvm.ptr, i32, i32) -> !llvm.ptr, f64 + %15 = llvm.load %14 : !llvm.ptr -> f64 + llvm.store %15, %13 : f64, !llvm.ptr // Function call-related operations. // @@ -130,10 +128,10 @@ func.func @ops(%arg0: i32, %arg1: f32, // Integer to pointer and pointer to integer conversions. 
// -// CHECK: %[[PTR:.*]] = llvm.inttoptr %[[I32]] : i32 to !llvm.ptr -// CHECK: %{{.*}} = llvm.ptrtoint %[[PTR]] : !llvm.ptr to i32 - %25 = llvm.inttoptr %arg0 : i32 to !llvm.ptr - %26 = llvm.ptrtoint %25 : !llvm.ptr to i32 +// CHECK: %[[PTR:.*]] = llvm.inttoptr %[[I32]] : i32 to !llvm.ptr +// CHECK: %{{.*}} = llvm.ptrtoint %[[PTR]] : !llvm.ptr to i32 + %25 = llvm.inttoptr %arg0 : i32 to !llvm.ptr + %26 = llvm.ptrtoint %25 : !llvm.ptr to i32 // Extended and Quad floating point // @@ -163,28 +161,27 @@ func.func @ops(%arg0: i32, %arg1: f32, // CHECK: llvm.intr.round(%[[FLOAT]]) : (f32) -> f32 %34 = llvm.intr.round(%arg1) : (f32) -> f32 -// CHECK: "llvm.intr.memcpy"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr, i32, i1) -> () - "llvm.intr.memcpy"(%arg2, %arg3, %arg0, %arg4) : (!llvm.ptr, !llvm.ptr, i32, i1) -> () +// CHECK: "llvm.intr.memcpy"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr, i32, i1) -> () + "llvm.intr.memcpy"(%arg2, %arg3, %arg0, %arg4) : (!llvm.ptr, !llvm.ptr, i32, i1) -> () -// CHECK: "llvm.intr.memcpy"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr, i32, i1) -> () - "llvm.intr.memcpy"(%arg2, %arg3, %arg0, %arg4) : (!llvm.ptr, !llvm.ptr, i32, i1) -> () +// CHECK: "llvm.intr.memcpy"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr, i32, i1) -> () + "llvm.intr.memcpy"(%arg2, %arg3, %arg0, %arg4) : (!llvm.ptr, !llvm.ptr, i32, i1) -> () // CHECK: %[[SZ:.*]] = llvm.mlir.constant %sz = llvm.mlir.constant(10: i64) : i64 -// CHECK: "llvm.intr.memcpy.inline"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr, i64, i1) -> () - "llvm.intr.memcpy.inline"(%arg2, %arg3, %sz, %arg4) : (!llvm.ptr, !llvm.ptr, i64, i1) -> () +// CHECK: "llvm.intr.memcpy.inline"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr, i64, i1) -> () + "llvm.intr.memcpy.inline"(%arg2, %arg3, %sz, %arg4) : (!llvm.ptr, !llvm.ptr, i64, i1) -> () // CHECK: llvm.return llvm.return } // CHECK-LABEL: @gep -llvm.func @gep(%ptr: !llvm.ptr)>>, %idx: i64, - %ptr2: !llvm.ptr)>>) { - // CHECK: llvm.getelementptr %{{.*}}[%{{.*}}, 1, 0] : (!llvm.ptr)>>, i64) -> !llvm.ptr - llvm.getelementptr %ptr[%idx, 1, 0] : (!llvm.ptr)>>, i64) -> !llvm.ptr - // CHECK: llvm.getelementptr inbounds %{{.*}}[%{{.*}}, 0, %{{.*}}] : (!llvm.ptr)>>, i64, i64) -> !llvm.ptr - llvm.getelementptr inbounds %ptr2[%idx, 0, %idx] : (!llvm.ptr)>>, i64, i64) -> !llvm.ptr +llvm.func @gep(%ptr: !llvm.ptr, %idx: i64, %ptr2: !llvm.ptr) { + // CHECK: llvm.getelementptr %{{.*}}[%{{.*}}, 1, 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(i32, struct<(i32, f32)>)> + llvm.getelementptr %ptr[%idx, 1, 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(i32, struct<(i32, f32)>)> + // CHECK: llvm.getelementptr inbounds %{{.*}}[%{{.*}}, 0, %{{.*}}] : (!llvm.ptr, i64, i64) -> !llvm.ptr, !llvm.struct<(array<10 x f32>)> + llvm.getelementptr inbounds %ptr2[%idx, 0, %idx] : (!llvm.ptr, i64, i64) -> !llvm.ptr, !llvm.struct<(array<10 x f32>)> llvm.return } @@ -248,9 +245,9 @@ llvm.func @foo(%arg0: i32) -> !llvm.struct<(i32, f64, i32)> { } // CHECK-LABEL: @casts -// CHECK-SAME: (%[[I32:.*]]: i32, %[[I64:.*]]: i64, %[[V4I32:.*]]: vector<4xi32>, %[[V4I64:.*]]: vector<4xi64>, %[[I32PTR:.*]]: !llvm.ptr) +// CHECK-SAME: (%[[I32:.*]]: i32, %[[I64:.*]]: i64, %[[V4I32:.*]]: vector<4xi32>, %[[V4I64:.*]]: vector<4xi64>, %[[PTR:.*]]: !llvm.ptr) func.func @casts(%arg0: i32, %arg1: i64, %arg2: vector<4xi32>, - %arg3: vector<4xi64>, %arg4: !llvm.ptr) { + %arg3: vector<4xi64>, %arg4: !llvm.ptr) { // CHECK: = llvm.sext %[[I32]] : 
i32 to i56 %0 = llvm.sext %arg0 : i32 to i56 // CHECK: = llvm.zext %[[I32]] : i32 to i64 @@ -271,21 +268,23 @@ func.func @casts(%arg0: i32, %arg1: i64, %arg2: vector<4xi32>, %8 = llvm.fptosi %7 : f32 to i32 // CHECK: = llvm.fptoui %[[FLOAT]] : f32 to i32 %9 = llvm.fptoui %7 : f32 to i32 -// CHECK: = llvm.addrspacecast %[[I32PTR]] : !llvm.ptr to !llvm.ptr - %10 = llvm.addrspacecast %arg4 : !llvm.ptr to !llvm.ptr +// CHECK: = llvm.addrspacecast %[[PTR]] : !llvm.ptr to !llvm.ptr<2> + %10 = llvm.addrspacecast %arg4 : !llvm.ptr to !llvm.ptr<2> +// CHECK: = llvm.bitcast %[[I64]] : i64 to f64 + %11 = llvm.bitcast %arg1 : i64 to f64 llvm.return } // CHECK-LABEL: @vect -func.func @vect(%arg0: vector<4xf32>, %arg1: i32, %arg2: f32, %arg3: !llvm.vec<2 x ptr>) { +func.func @vect(%arg0: vector<4xf32>, %arg1: i32, %arg2: f32, %arg3: !llvm.vec<2 x ptr>) { // CHECK: = llvm.extractelement {{.*}} : vector<4xf32> %0 = llvm.extractelement %arg0[%arg1 : i32] : vector<4xf32> // CHECK: = llvm.insertelement {{.*}} : vector<4xf32> %1 = llvm.insertelement %arg2, %arg0[%arg1 : i32] : vector<4xf32> // CHECK: = llvm.shufflevector {{.*}} [0, 0, 0, 0, 7] : vector<4xf32> %2 = llvm.shufflevector %arg0, %arg0 [0, 0, 0, 0, 7] : vector<4xf32> -// CHECK: = llvm.shufflevector %{{.+}}, %{{.+}} [1, 0] : !llvm.vec<2 x ptr> - %3 = llvm.shufflevector %arg3, %arg3 [1, 0] : !llvm.vec<2 x ptr> +// CHECK: = llvm.shufflevector %{{.+}}, %{{.+}} [1, 0] : !llvm.vec<2 x ptr> + %3 = llvm.shufflevector %arg3, %arg3 [1, 0] : !llvm.vec<2 x ptr> // CHECK: = llvm.mlir.constant(dense<1.000000e+00> : vector<4xf32>) : vector<4xf32> %4 = llvm.mlir.constant(dense<1.0> : vector<4xf32>) : vector<4xf32> return @@ -323,19 +322,17 @@ func.func @mixed_vect(%arg0: vector<8xf32>, %arg1: vector<4xf32>, %arg2: vector< // CHECK-LABEL: @alloca func.func @alloca(%size : i64) { - // CHECK: llvm.alloca %{{.*}} x i32 : (i64) -> !llvm.ptr - llvm.alloca %size x i32 {alignment = 0} : (i64) -> (!llvm.ptr) - // CHECK: llvm.alloca inalloca %{{.*}} x i32 {alignment = 8 : i64} : (i64) -> !llvm.ptr - llvm.alloca inalloca %size x i32 {alignment = 8} : (i64) -> (!llvm.ptr) + // CHECK: llvm.alloca %{{.*}} x i32 : (i64) -> !llvm.ptr + llvm.alloca %size x i32 {alignment = 0} : (i64) -> (!llvm.ptr) + // CHECK: llvm.alloca inalloca %{{.*}} x i32 {alignment = 8 : i64} : (i64) -> !llvm.ptr + llvm.alloca inalloca %size x i32 {alignment = 8} : (i64) -> (!llvm.ptr) llvm.return } // CHECK-LABEL: @null func.func @null() { - // CHECK: llvm.mlir.null : !llvm.ptr - %0 = llvm.mlir.null : !llvm.ptr - // CHECK: llvm.mlir.null : !llvm.ptr>)>>, i64)>> - %1 = llvm.mlir.null : !llvm.ptr>)>>, i64)>> + // CHECK: llvm.mlir.null : !llvm.ptr + %0 = llvm.mlir.null : !llvm.ptr llvm.return } @@ -375,61 +372,57 @@ func.func @cmpxchg(%ptr : !llvm.ptr, %cmp : i32, %new : i32) { llvm.return } -llvm.mlir.global external constant @_ZTIi() : !llvm.ptr -llvm.func @bar(!llvm.ptr, !llvm.ptr, !llvm.ptr) +llvm.mlir.global external constant @_ZTIi() : !llvm.ptr +llvm.func @bar(!llvm.ptr, !llvm.ptr, !llvm.ptr) llvm.func @__gxx_personality_v0(...) 
-> i32 // CHECK-LABEL: @invokeLandingpad llvm.func @invokeLandingpad() -> i32 attributes { personality = @__gxx_personality_v0 } { -// CHECK: %[[a0:.*]] = llvm.mlir.constant(0 : i32) : i32 +// CHECK: %[[V0:.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK: %{{.*}} = llvm.mlir.constant(3 : i32) : i32 -// CHECK: %[[a2:.*]] = llvm.mlir.constant("\01") : !llvm.array<1 x i8> -// CHECK: %[[a3:.*]] = llvm.mlir.null : !llvm.ptr> -// CHECK: %[[a4:.*]] = llvm.mlir.null : !llvm.ptr -// CHECK: %[[a5:.*]] = llvm.mlir.addressof @_ZTIi : !llvm.ptr> -// CHECK: %[[a6:.*]] = llvm.bitcast %[[a5]] : !llvm.ptr> to !llvm.ptr -// CHECK: %[[a7:.*]] = llvm.mlir.constant(1 : i32) : i32 -// CHECK: %[[a8:.*]] = llvm.alloca %[[a7]] x i8 : (i32) -> !llvm.ptr -// CHECK: %{{.*}} = llvm.invoke @foo(%[[a7]]) to ^[[BB2:.*]] unwind ^[[BB1:.*]] : (i32) -> !llvm.struct<(i32, f64, i32)> +// CHECK: %[[V1:.*]] = llvm.mlir.constant("\01") : !llvm.array<1 x i8> +// CHECK: %[[V2:.*]] = llvm.mlir.null : !llvm.ptr +// CHECK: %[[V3:.*]] = llvm.mlir.addressof @_ZTIi : !llvm.ptr +// CHECK: %[[V4:.*]] = llvm.mlir.constant(1 : i32) : i32 +// CHECK: %[[V5:.*]] = llvm.alloca %[[V4]] x i8 : (i32) -> !llvm.ptr +// CHECK: %{{.*}} = llvm.invoke @foo(%[[V4]]) to ^[[BB2:.*]] unwind ^[[BB1:.*]] : (i32) -> !llvm.struct<(i32, f64, i32)> %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.constant(3 : i32) : i32 %2 = llvm.mlir.constant("\01") : !llvm.array<1 x i8> - %3 = llvm.mlir.null : !llvm.ptr> - %4 = llvm.mlir.null : !llvm.ptr - %5 = llvm.mlir.addressof @_ZTIi : !llvm.ptr> - %6 = llvm.bitcast %5 : !llvm.ptr> to !llvm.ptr - %7 = llvm.mlir.constant(1 : i32) : i32 - %8 = llvm.alloca %7 x i8 : (i32) -> !llvm.ptr - %9 = llvm.invoke @foo(%7) to ^bb2 unwind ^bb1 : (i32) -> !llvm.struct<(i32, f64, i32)> + %3 = llvm.mlir.null : !llvm.ptr + %4 = llvm.mlir.addressof @_ZTIi : !llvm.ptr + %5 = llvm.mlir.constant(1 : i32) : i32 + %6 = llvm.alloca %5 x i8 : (i32) -> !llvm.ptr + %7 = llvm.invoke @foo(%5) to ^bb2 unwind ^bb1 : (i32) -> !llvm.struct<(i32, f64, i32)> // CHECK: ^[[BB1]]: -// CHECK: %[[lp:.*]] = llvm.landingpad cleanup (catch %[[a3]] : !llvm.ptr>) (catch %[[a6]] : !llvm.ptr) (filter %[[a2]] : !llvm.array<1 x i8>) : !llvm.struct<(ptr, i32)> -// CHECK: %{{.*}} = llvm.intr.eh.typeid.for %6 : (!llvm.ptr) -> i32 -// CHECK: llvm.resume %[[lp]] : !llvm.struct<(ptr, i32)> +// CHECK: %[[lp:.*]] = llvm.landingpad cleanup (catch %[[V2]] : !llvm.ptr) (catch %[[V3]] : !llvm.ptr) (filter %[[V1]] : !llvm.array<1 x i8>) : !llvm.struct<(ptr, i32)> +// CHECK: %{{.*}} = llvm.intr.eh.typeid.for %[[V3]] : (!llvm.ptr) -> i32 +// CHECK: llvm.resume %[[lp]] : !llvm.struct<(ptr, i32)> ^bb1: - %10 = llvm.landingpad cleanup (catch %3 : !llvm.ptr>) (catch %6 : !llvm.ptr) (filter %2 : !llvm.array<1 x i8>) : !llvm.struct<(ptr, i32)> - %11 = llvm.intr.eh.typeid.for %6 : (!llvm.ptr) -> i32 - llvm.resume %10 : !llvm.struct<(ptr, i32)> + %10 = llvm.landingpad cleanup (catch %3 : !llvm.ptr) (catch %4 : !llvm.ptr) (filter %2 : !llvm.array<1 x i8>) : !llvm.struct<(ptr, i32)> + %11 = llvm.intr.eh.typeid.for %4 : (!llvm.ptr) -> i32 + llvm.resume %10 : !llvm.struct<(ptr, i32)> // CHECK: ^[[BB2]]: -// CHECK: llvm.return %[[a7]] : i32 +// CHECK: llvm.return %[[V4]] : i32 ^bb2: - llvm.return %7 : i32 + llvm.return %5 : i32 // CHECK: ^[[BB3:.*]]: -// CHECK: llvm.invoke @bar(%[[a8]], %[[a6]], %[[a4]]) to ^[[BB2]] unwind ^[[BB1]] : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> () +// CHECK: llvm.invoke @bar(%[[V5]], %[[V3]], %[[V2]]) to ^[[BB2]] unwind ^[[BB1]] : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> () 
^bb3: - llvm.invoke @bar(%8, %6, %4) to ^bb2 unwind ^bb1 : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> () + llvm.invoke @bar(%6, %4, %3) to ^bb2 unwind ^bb1 : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> () // CHECK: ^[[BB4:.*]]: // CHECK: %[[FUNC:.*]] = llvm.mlir.addressof @foo : !llvm.ptr // CHECK: %{{.*}} = llvm.invoke %[[FUNC]]{{.*}}: !llvm.ptr, ^bb4: %12 = llvm.mlir.addressof @foo : !llvm.ptr - %13 = llvm.invoke %12(%7) to ^bb2 unwind ^bb1 : !llvm.ptr, (i32) -> !llvm.struct<(i32, f64, i32)> + %13 = llvm.invoke %12(%5) to ^bb2 unwind ^bb1 : !llvm.ptr, (i32) -> !llvm.struct<(i32, f64, i32)> // CHECK: ^[[BB5:.*]]: -// CHECK: llvm.return %[[a0]] : i32 +// CHECK: llvm.return %[[V0]] : i32 ^bb5: llvm.return %0 : i32 } @@ -528,32 +521,6 @@ func.func @fastmathFlags(%arg0: f32, %arg1: f32, %arg2: i32, %arg3: vector<2 x f return } -// CHECK-LABEL: llvm.func @vararg_func -llvm.func @vararg_func(%arg0: i32, ...) { - // CHECK: %{{.*}} = llvm.mlir.constant(1 : i32) : i32 - // CHECK: %{{.*}} = llvm.mlir.constant(1 : i32) : i32 - %0 = llvm.mlir.constant(1 : i32) : i32 - %1 = llvm.mlir.constant(1 : i32) : i32 - // CHECK: %[[ALLOCA0:.+]] = llvm.alloca %{{.*}} x !llvm.struct<"struct.va_list", (ptr)> {alignment = 8 : i64} : (i32) -> !llvm.ptr)>> - // CHECK: %[[CAST0:.+]] = llvm.bitcast %[[ALLOCA0]] : !llvm.ptr)>> to !llvm.ptr - %2 = llvm.alloca %1 x !llvm.struct<"struct.va_list", (ptr)> {alignment = 8 : i64} : (i32) -> !llvm.ptr)>> - %3 = llvm.bitcast %2 : !llvm.ptr)>> to !llvm.ptr - // CHECK: llvm.intr.vastart %[[CAST0]] - llvm.intr.vastart %3 : !llvm.ptr - // CHECK: %[[ALLOCA1:.+]] = llvm.alloca %{{.*}} x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr> - // CHECK: %[[CAST1:.+]] = llvm.bitcast %[[ALLOCA1]] : !llvm.ptr> to !llvm.ptr - %4 = llvm.alloca %0 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr> - %5 = llvm.bitcast %4 : !llvm.ptr> to !llvm.ptr - // CHECK: llvm.intr.vacopy %[[CAST0]] to %[[CAST1]] - llvm.intr.vacopy %3 to %5 : !llvm.ptr, !llvm.ptr - // CHECK: llvm.intr.vaend %[[CAST1]] - // CHECK: llvm.intr.vaend %[[CAST0]] - llvm.intr.vaend %5 : !llvm.ptr - llvm.intr.vaend %3 : !llvm.ptr - // CHECK: llvm.return - llvm.return -} - // CHECK-LABEL: @lifetime // CHECK-SAME: %[[P:.*]]: !llvm.ptr llvm.func @lifetime(%p: !llvm.ptr) { @@ -564,8 +531,8 @@ llvm.func @lifetime(%p: !llvm.ptr) { llvm.return } -// CHECK-LABEL: @vararg_func_opaque_pointers -llvm.func @vararg_func_opaque_pointers(%arg0: i32, ...) { +// CHECK-LABEL: @vararg_func +llvm.func @vararg_func(%arg0: i32, ...) { // CHECK: %[[C:.*]] = llvm.mlir.constant(1 : i32) // CHECK: %[[LIST:.*]] = llvm.alloca // CHECK: llvm.intr.vastart %[[LIST]] : !llvm.ptr{{$}} @@ -585,17 +552,17 @@ llvm.func @vararg_func_opaque_pointers(%arg0: i32, ...) 
{ llvm.return } -// CHECK-LABEL: @eh_typeid_opaque_pointers +// CHECK-LABEL: @eh_typeid // CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr -llvm.func @eh_typeid_opaque_pointers(%arg0: !llvm.ptr) -> i32 { +llvm.func @eh_typeid(%arg0: !llvm.ptr) -> i32 { // CHECK: llvm.intr.eh.typeid.for %[[ARG0]] : (!llvm.ptr) -> i32 %0 = llvm.intr.eh.typeid.for %arg0 : (!llvm.ptr) -> i32 llvm.return %0 : i32 } -// CHECK-LABEL: @stackrestore_opaque_pointers +// CHECK-LABEL: @stackrestore // CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr -llvm.func @stackrestore_opaque_pointers(%arg0: !llvm.ptr) { +llvm.func @stackrestore(%arg0: !llvm.ptr) { // CHECK: llvm.intr.stackrestore %[[ARG0]] : !llvm.ptr llvm.intr.stackrestore %arg0 : !llvm.ptr llvm.return From 467cf1542808851773500fe0af0da916f46fa80c Mon Sep 17 00:00:00 2001 From: Congcong Cai Date: Thu, 23 Mar 2023 15:40:55 +0800 Subject: [PATCH 075/208] [NFC] Fix typo lld::wasm in comment --- lld/wasm/Driver.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp index e697f4b55ae6f..310f9df2d5b68 100644 --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -1217,4 +1217,4 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { writeResult(); } -} // namespace wasm::lld +} // namespace lld::wasm From 5acd29eb4d9e411b3631c26babcd1d2655623f4a Mon Sep 17 00:00:00 2001 From: Martin Braenne Date: Thu, 23 Mar 2023 07:45:40 +0000 Subject: [PATCH 076/208] [clang][dataflow] Fix crash when RHS of `&&` or `||` calls `noreturn` func. The crash happened because the transfer fucntion for `&&` and `||` unconditionally tried to retrieve the value of the RHS. However, if the RHS is unreachable, there is no environment for it, and trying to retrieve the operand's value causes an assertion failure. See also the comments in the code for further details. Reviewed By: xazax.hun, ymandel, sgatev, gribozavr2 Differential Revision: https://reviews.llvm.org/D146514 --- .../FlowSensitive/ControlFlowContext.h | 13 +++- .../clang/Analysis/FlowSensitive/Transfer.h | 6 +- .../FlowSensitive/ControlFlowContext.cpp | 29 +++++++- clang/lib/Analysis/FlowSensitive/Transfer.cpp | 42 ++++++++---- .../TypeErasedDataflowAnalysis.cpp | 2 + .../Analysis/FlowSensitive/TestingSupport.h | 14 ++++ .../Analysis/FlowSensitive/TransferTest.cpp | 66 +++++++++++++++++++ 7 files changed, 153 insertions(+), 19 deletions(-) diff --git a/clang/include/clang/Analysis/FlowSensitive/ControlFlowContext.h b/clang/include/clang/Analysis/FlowSensitive/ControlFlowContext.h index e641468f77d00..3495bdfc538cb 100644 --- a/clang/include/clang/Analysis/FlowSensitive/ControlFlowContext.h +++ b/clang/include/clang/Analysis/FlowSensitive/ControlFlowContext.h @@ -18,6 +18,7 @@ #include "clang/AST/Decl.h" #include "clang/AST/Stmt.h" #include "clang/Analysis/CFG.h" +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/Support/Error.h" #include @@ -47,18 +48,26 @@ class ControlFlowContext { return StmtToBlock; } + /// Returns whether `B` is reachable from the entry block. + bool isBlockReachable(const CFGBlock &B) const { + return BlockReachable[B.getBlockID()]; + } + private: // FIXME: Once the deprecated `build` method is removed, mark `D` as "must not // be null" and add an assertion. 
ControlFlowContext(const Decl *D, std::unique_ptr Cfg, - llvm::DenseMap StmtToBlock) + llvm::DenseMap StmtToBlock, + llvm::BitVector BlockReachable) : ContainingDecl(D), Cfg(std::move(Cfg)), - StmtToBlock(std::move(StmtToBlock)) {} + StmtToBlock(std::move(StmtToBlock)), + BlockReachable(std::move(BlockReachable)) {} /// The `Decl` containing the statement used to construct the CFG. const Decl *ContainingDecl; std::unique_ptr Cfg; llvm::DenseMap StmtToBlock; + llvm::BitVector BlockReachable; }; } // namespace dataflow diff --git a/clang/include/clang/Analysis/FlowSensitive/Transfer.h b/clang/include/clang/Analysis/FlowSensitive/Transfer.h index 78a426ed94dd5..db3d780bf35e5 100644 --- a/clang/include/clang/Analysis/FlowSensitive/Transfer.h +++ b/clang/include/clang/Analysis/FlowSensitive/Transfer.h @@ -26,9 +26,9 @@ class StmtToEnvMap { public: virtual ~StmtToEnvMap() = default; - /// Returns the environment of the basic block that contains `S` or nullptr if - /// there isn't one. - /// FIXME: Ensure that the result can't be null and return a const reference. + /// Retrieves the environment of the basic block that contains `S`. + /// If `S` is reachable, returns a non-null pointer to the environment. + /// If `S` is not reachable, returns nullptr. virtual const Environment *getEnvironment(const Stmt &S) const = 0; }; diff --git a/clang/lib/Analysis/FlowSensitive/ControlFlowContext.cpp b/clang/lib/Analysis/FlowSensitive/ControlFlowContext.cpp index 2492b5203724c..6699a0fc9d79e 100644 --- a/clang/lib/Analysis/FlowSensitive/ControlFlowContext.cpp +++ b/clang/lib/Analysis/FlowSensitive/ControlFlowContext.cpp @@ -16,6 +16,7 @@ #include "clang/AST/Decl.h" #include "clang/AST/Stmt.h" #include "clang/Analysis/CFG.h" +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/Support/Error.h" #include @@ -44,6 +45,28 @@ buildStmtToBasicBlockMap(const CFG &Cfg) { return StmtToBlock; } +static llvm::BitVector findReachableBlocks(const CFG &Cfg) { + llvm::BitVector BlockReachable(Cfg.getNumBlockIDs(), false); + + llvm::SmallVector BlocksToVisit; + BlocksToVisit.push_back(&Cfg.getEntry()); + while (!BlocksToVisit.empty()) { + const CFGBlock *Block = BlocksToVisit.back(); + BlocksToVisit.pop_back(); + + if (BlockReachable[Block->getBlockID()]) + continue; + + BlockReachable[Block->getBlockID()] = true; + + for (const CFGBlock *Succ : Block->succs()) + if (Succ) + BlocksToVisit.push_back(Succ); + } + + return BlockReachable; +} + llvm::Expected ControlFlowContext::build(const Decl *D, Stmt &S, ASTContext &C) { CFG::BuildOptions Options; @@ -64,7 +87,11 @@ ControlFlowContext::build(const Decl *D, Stmt &S, ASTContext &C) { llvm::DenseMap StmtToBlock = buildStmtToBasicBlockMap(*Cfg); - return ControlFlowContext(D, std::move(Cfg), std::move(StmtToBlock)); + + llvm::BitVector BlockReachable = findReachableBlocks(*Cfg); + + return ControlFlowContext(D, std::move(Cfg), std::move(StmtToBlock), + std::move(BlockReachable)); } } // namespace dataflow diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp index e427f1458a8db..a1ed37da54c28 100644 --- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp +++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp @@ -162,15 +162,27 @@ class TransferVisitor : public ConstStmtVisitor { } case BO_LAnd: case BO_LOr: { - BoolValue &LHSVal = getLogicOperatorSubExprValue(*LHS); - BoolValue &RHSVal = getLogicOperatorSubExprValue(*RHS); - auto &Loc = Env.createStorageLocation(*S); Env.setStorageLocation(*S, Loc); + + 
BoolValue *LHSVal = getLogicOperatorSubExprValue(*LHS); + // If the LHS was not reachable, this BinaryOperator would also not be + // reachable, and we would never get here. + assert(LHSVal != nullptr); + BoolValue *RHSVal = getLogicOperatorSubExprValue(*RHS); + if (RHSVal == nullptr) { + // If the RHS isn't reachable and we evaluate this BinaryOperator, + // then the value of the LHS must have triggered the short-circuit + // logic. This implies that the value of the entire expression must be + // equal to the value of the LHS. + Env.setValue(Loc, *LHSVal); + break; + } + if (S->getOpcode() == BO_LAnd) - Env.setValue(Loc, Env.makeAnd(LHSVal, RHSVal)); + Env.setValue(Loc, Env.makeAnd(*LHSVal, *RHSVal)); else - Env.setValue(Loc, Env.makeOr(LHSVal, RHSVal)); + Env.setValue(Loc, Env.makeOr(*LHSVal, *RHSVal)); break; } case BO_NE: @@ -779,15 +791,19 @@ class TransferVisitor : public ConstStmtVisitor { } private: - BoolValue &getLogicOperatorSubExprValue(const Expr &SubExpr) { + /// If `SubExpr` is reachable, returns a non-null pointer to the value for + /// `SubExpr`. If `SubExpr` is not reachable, returns nullptr. + BoolValue *getLogicOperatorSubExprValue(const Expr &SubExpr) { // `SubExpr` and its parent logic operator might be part of different basic // blocks. We try to access the value that is assigned to `SubExpr` in the // corresponding environment. - if (const Environment *SubExprEnv = StmtToEnv.getEnvironment(SubExpr)) { - if (auto *Val = dyn_cast_or_null( - SubExprEnv->getValue(SubExpr, SkipPast::Reference))) - return *Val; - } + const Environment *SubExprEnv = StmtToEnv.getEnvironment(SubExpr); + if (!SubExprEnv) + return nullptr; + + if (auto *Val = dyn_cast_or_null( + SubExprEnv->getValue(SubExpr, SkipPast::Reference))) + return Val; if (Env.getStorageLocation(SubExpr, SkipPast::None) == nullptr) { // Sub-expressions that are logic operators are not added in basic blocks @@ -800,11 +816,11 @@ class TransferVisitor : public ConstStmtVisitor { if (auto *Val = dyn_cast_or_null( Env.getValue(SubExpr, SkipPast::Reference))) - return *Val; + return Val; // If the value of `SubExpr` is still unknown, we create a fresh symbolic // boolean value for it. - return Env.makeAtomicBoolValue(); + return &Env.makeAtomicBoolValue(); } // If context sensitivity is enabled, try to analyze the body of the callee diff --git a/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp b/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp index fe00d765b6bef..d94b547ca17de 100644 --- a/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp +++ b/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp @@ -51,6 +51,8 @@ class StmtToEnvMapImpl : public StmtToEnvMap { const Environment *getEnvironment(const Stmt &S) const override { auto BlockIt = CFCtx.getStmtToBlock().find(&ignoreCFGOmittedNodes(S)); assert(BlockIt != CFCtx.getStmtToBlock().end()); + if (!CFCtx.isBlockReachable(*BlockIt->getSecond())) + return nullptr; const auto &State = BlockToState[BlockIt->getSecond()->getBlockID()]; assert(State); return &State->Env; diff --git a/clang/unittests/Analysis/FlowSensitive/TestingSupport.h b/clang/unittests/Analysis/FlowSensitive/TestingSupport.h index bc089f141850a..ef67dc98790c0 100644 --- a/clang/unittests/Analysis/FlowSensitive/TestingSupport.h +++ b/clang/unittests/Analysis/FlowSensitive/TestingSupport.h @@ -389,6 +389,20 @@ checkDataflow(AnalysisInputs AI, /// `Name` must be unique in `ASTCtx`. 
const ValueDecl *findValueDecl(ASTContext &ASTCtx, llvm::StringRef Name); +/// Returns the value (of type `ValueT`) for the given identifier. +/// `ValueT` must be a subclass of `Value` and must be of the appropriate type. +/// +/// Requirements: +/// +/// `Name` must be unique in `ASTCtx`. +template +ValueT &getValueForDecl(ASTContext &ASTCtx, const Environment &Env, + llvm::StringRef Name) { + const ValueDecl *VD = findValueDecl(ASTCtx, Name); + assert(VD != nullptr); + return *cast(Env.getValue(*VD, SkipPast::None)); +} + /// Creates and owns constraints which are boolean values. class ConstraintContext { public: diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp index 9c16335714c55..1bb772a93bda6 100644 --- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp @@ -5104,4 +5104,70 @@ TEST(TransferTest, UnnamedBitfieldInitializer) { }); } +// Repro for a crash that used to occur when we call a `noreturn` function +// within one of the operands of a `&&` or `||` operator. +TEST(TransferTest, NoReturnFunctionInsideShortCircuitedBooleanOp) { + std::string Code = R"( + __attribute__((noreturn)) int doesnt_return(); + bool some_condition(); + void target(bool b1, bool b2) { + // Neither of these should crash. In addition, if we don't terminate the + // program, we know that the operators need to trigger the short-circuit + // logic, so `NoreturnOnRhsOfAnd` will be false and `NoreturnOnRhsOfOr` + // will be true. + bool NoreturnOnRhsOfAnd = b1 && doesnt_return() > 0; + bool NoreturnOnRhsOfOr = b2 || doesnt_return() > 0; + + // Calling a `noreturn` function on the LHS of an `&&` or `||` makes the + // entire expression unreachable. So we know that in both of the following + // cases, if `target()` terminates, the `else` branch was taken. + bool NoreturnOnLhsMakesAndUnreachable = false; + if (some_condition()) + doesnt_return() > 0 && some_condition(); + else + NoreturnOnLhsMakesAndUnreachable = true; + + bool NoreturnOnLhsMakesOrUnreachable = false; + if (some_condition()) + doesnt_return() > 0 || some_condition(); + else + NoreturnOnLhsMakesOrUnreachable = true; + + // [[p]] + } + )"; + runDataflow( + Code, + [](const llvm::StringMap> &Results, + ASTContext &ASTCtx) { + ASSERT_THAT(Results.keys(), UnorderedElementsAre("p")); + const Environment &Env = getEnvironmentAtAnnotation(Results, "p"); + + // Check that [[p]] is reachable with a non-false flow condition. 
+ EXPECT_FALSE(Env.flowConditionImplies(Env.getBoolLiteralValue(false))); + + auto &B1 = getValueForDecl(ASTCtx, Env, "b1"); + EXPECT_TRUE(Env.flowConditionImplies(Env.makeNot(B1))); + + auto &NoreturnOnRhsOfAnd = + getValueForDecl(ASTCtx, Env, "NoreturnOnRhsOfAnd"); + EXPECT_TRUE(Env.flowConditionImplies(Env.makeNot(NoreturnOnRhsOfAnd))); + + auto &B2 = getValueForDecl(ASTCtx, Env, "b2"); + EXPECT_TRUE(Env.flowConditionImplies(B2)); + + auto &NoreturnOnRhsOfOr = + getValueForDecl(ASTCtx, Env, "NoreturnOnRhsOfOr"); + EXPECT_TRUE(Env.flowConditionImplies(NoreturnOnRhsOfOr)); + + auto &NoreturnOnLhsMakesAndUnreachable = getValueForDecl( + ASTCtx, Env, "NoreturnOnLhsMakesAndUnreachable"); + EXPECT_TRUE(Env.flowConditionImplies(NoreturnOnLhsMakesAndUnreachable)); + + auto &NoreturnOnLhsMakesOrUnreachable = getValueForDecl( + ASTCtx, Env, "NoreturnOnLhsMakesOrUnreachable"); + EXPECT_TRUE(Env.flowConditionImplies(NoreturnOnLhsMakesOrUnreachable)); + }); +} + } // namespace From b08d35f826a6b7696a02f1d811da7a2f951e74a1 Mon Sep 17 00:00:00 2001 From: Carlos Galvez Date: Wed, 22 Mar 2023 18:47:48 +0000 Subject: [PATCH 077/208] [clang-tidy] Ignore DISABLED_ in test suite name in google-avoid-underscore-in-googletest-name Test suite name can also be disabled with DISABLED_, not just the test case name. Fix also broken link in the test that refers to the explanation as to why underscores may not be used. Differential Revision: https://reviews.llvm.org/D146655 --- .../google/AvoidUnderscoreInGoogletestNameCheck.cpp | 6 ++++-- clang-tools-extra/docs/ReleaseNotes.rst | 4 ++++ .../google/avoid-underscore-in-googletest-name.cpp | 12 +++++++++++- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/clang-tools-extra/clang-tidy/google/AvoidUnderscoreInGoogletestNameCheck.cpp b/clang-tools-extra/clang-tidy/google/AvoidUnderscoreInGoogletestNameCheck.cpp index c5bd6055072aa..b903f2552b7e6 100644 --- a/clang-tools-extra/clang-tidy/google/AvoidUnderscoreInGoogletestNameCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/AvoidUnderscoreInGoogletestNameCheck.cpp @@ -51,8 +51,10 @@ class AvoidUnderscoreInGoogletestNameCallback : public PPCallbacks { const Token *TestNameToken = Args->getUnexpArgument(1); if (!TestCaseNameToken || !TestNameToken) return; - std::string TestCaseName = PP->getSpelling(*TestCaseNameToken); - if (TestCaseName.find('_') != std::string::npos) + std::string TestCaseNameMaybeDisabled = PP->getSpelling(*TestCaseNameToken); + StringRef TestCaseName = TestCaseNameMaybeDisabled; + TestCaseName.consume_front(KDisabledTestPrefix); + if (TestCaseName.contains('_')) Check->diag(TestCaseNameToken->getLocation(), "avoid using \"_\" in test case name \"%0\" according to " "Googletest FAQ") diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 3f79e8e2a187a..80f5b46681713 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -234,6 +234,10 @@ Changes in existing checks string for ``Prefix`` or ``Suffix`` options could result in the style not being used. +- Fixed an issue in :doc:`google-avoid-underscore-in-googletest-name + ` when using + ``DISABLED_`` in the test suite name. 
+ Removed checks ^^^^^^^^^^^^^^ diff --git a/clang-tools-extra/test/clang-tidy/checkers/google/avoid-underscore-in-googletest-name.cpp b/clang-tools-extra/test/clang-tidy/checkers/google/avoid-underscore-in-googletest-name.cpp index 6e8a5c2d50af9..3ab5a6ffe383b 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/google/avoid-underscore-in-googletest-name.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/google/avoid-underscore-in-googletest-name.cpp @@ -87,21 +87,31 @@ TYPED_TEST_P(Illegal_Type_ParameterizedTestCaseName, TestName) {} // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: avoid using "_" in test case name "Illegal_Type_ParameterizedTestCaseName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] // Underscores are allowed to disable a test with the DISABLED_ prefix. -// https://github.com/google/googletest/blob/master/googletest/docs/faq.md#why-should-test-suite-names-and-test-names-not-contain-underscore +// https://google.github.io/googletest/faq.html#why-should-test-suite-names-and-test-names-not-contain-underscore TEST(TestCaseName, TestName) {} TEST(TestCaseName, DISABLED_TestName) {} +TEST(DISABLED_TestCaseName, TestName) {} +TEST(DISABLED_TestCaseName, DISABLED_TestName) {} TEST_F(TestCaseFixtureName, TestName) {} TEST_F(TestCaseFixtureName, DISABLED_TestName) {} +TEST_F(DISABLED_TestCaseFixtureName, TestName) {} +TEST_F(DISABLED_TestCaseFixtureName, DISABLED_TestName) {} TEST_P(ParameterizedTestCaseFixtureName, TestName) {} TEST_P(ParameterizedTestCaseFixtureName, DISABLED_TestName) {} +TEST_P(DISABLED_ParameterizedTestCaseFixtureName, TestName) {} +TEST_P(DISABLED_ParameterizedTestCaseFixtureName, DISABLED_TestName) {} TYPED_TEST(TypedTestName, TestName) {} TYPED_TEST(TypedTestName, DISABLED_TestName) {} +TYPED_TEST(DISABLED_TypedTestName, TestName) {} +TYPED_TEST(DISABLED_TypedTestName, DISABLED_TestName) {} TYPED_TEST_P(TypeParameterizedTestName, TestName) {} TYPED_TEST_P(TypeParameterizedTestName, DISABLED_TestName) {} +TYPED_TEST_P(DISABLED_TypeParameterizedTestName, TestName) {} +TYPED_TEST_P(DISABLED_TypeParameterizedTestName, DISABLED_TestName) {} FRIEND_TEST(FriendTest, Is_NotChecked) {} FRIEND_TEST(Friend_Test, IsNotChecked) {} From 5b0055a4ae8d27bf2a8db903eed22ff642fc27c3 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Thu, 23 Mar 2023 09:25:01 +0100 Subject: [PATCH 078/208] [mlir][Analysis][NFC] Split FlatAffineValueConstraints into multiple classes The new class hierarchy is as follows: * `IntegerRelation` (no change) * `IntegerPolyhedron` (no change) * `FlatLinearConstraints`: provides an AffineExpr-based API * `FlatLinearValueConstraints`: stores an additional mapping of non-local vars to SSA values * `FlatAffineValueConstraints`: provides additional helper functions for Affine dialect ops * `FlatAffineRelation` (no change) `FlatConstraints` and `FlatValueConstraints` are moved from `MLIRAffineAnalysis` to `MLIRAnalysis` and can be used without depending on the Affine dialect. This change is in preparation of D145681, which adds an MLIR interface that depends on `FlatConstraints` (and cannot depend on the Affine dialect or any other dialect). 
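To make the layering concrete, here is a minimal, illustrative C++ sketch of the inheritance structure described above. It is not code from this patch: class bodies are elided to empty stubs rather than guessed, and only the parent/child relationships and one-line roles stated above are reflected.

    // Illustrative only: bodies elided; the real declarations are in the
    // headers changed by this patch (mlir/Analysis and Dialect/Affine/Analysis).
    namespace presburger {
    class IntegerRelation {};                                // unchanged
    class IntegerPolyhedron : public IntegerRelation {};     // unchanged
    } // namespace presburger

    // Provides an AffineExpr-based API; usable without the Affine dialect.
    class FlatLinearConstraints : public presburger::IntegerPolyhedron {};

    // Additionally maps each non-local variable to an optional SSA Value.
    class FlatLinearValueConstraints : public FlatLinearConstraints {};

    // Adds helper functions for Affine dialect ops.
    class FlatAffineValueConstraints : public FlatLinearValueConstraints {};

    class FlatAffineRelation : public FlatAffineValueConstraints {};  // unchanged
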
Differential Revision: https://reviews.llvm.org/D146201 --- mlir/docs/Rationale/UsageOfConst.md | 6 +- .../Analysis/FlatLinearValueConstraints.h | 560 +++++++ .../Analysis/Presburger/IntegerRelation.h | 9 +- .../Affine/Analysis/AffineStructures.h | 516 +------ mlir/include/mlir/IR/AffineExprVisitor.h | 2 +- mlir/include/mlir/IR/IntegerSet.h | 2 +- mlir/lib/Analysis/CMakeLists.txt | 6 +- .../Analysis/FlatLinearValueConstraints.cpp | 1344 +++++++++++++++++ .../Affine/Analysis/AffineStructures.cpp | 1335 +--------------- mlir/lib/IR/AffineExpr.cpp | 2 +- mlir/lib/IR/AffineMap.cpp | 2 +- mlir/test/Transforms/memref-bound-check.mlir | 2 +- .../Transforms/memref-dependence-check.mlir | 2 +- 13 files changed, 1982 insertions(+), 1806 deletions(-) create mode 100644 mlir/include/mlir/Analysis/FlatLinearValueConstraints.h create mode 100644 mlir/lib/Analysis/FlatLinearValueConstraints.cpp diff --git a/mlir/docs/Rationale/UsageOfConst.md b/mlir/docs/Rationale/UsageOfConst.md index 102b948a0eac1..7a54a4e6de7f5 100644 --- a/mlir/docs/Rationale/UsageOfConst.md +++ b/mlir/docs/Rationale/UsageOfConst.md @@ -235,9 +235,9 @@ if (auto *dimOp = inst->dyn_cast()) { It is much better to eliminate them entirely, and just pass around `DimOp` directly. For example, instead of: -```C++ +```c++ LogicalResult mlir::getIndexSet(MutableArrayRef> forOps, - FlatAffineConstraints *domain) { + FlatAffineValueConstraints *domain) { ``` @@ -245,7 +245,7 @@ It is a lot nicer to just have: ```c++ LogicalResult mlir::getIndexSet(MutableArrayRef forOps, - FlatAffineConstraints *domain) { + FlatAffineValueConstraints *domain) { ``` Particularly since all of the `FooOp` classes are already semantically a smart diff --git a/mlir/include/mlir/Analysis/FlatLinearValueConstraints.h b/mlir/include/mlir/Analysis/FlatLinearValueConstraints.h new file mode 100644 index 0000000000000..a6900ab599386 --- /dev/null +++ b/mlir/include/mlir/Analysis/FlatLinearValueConstraints.h @@ -0,0 +1,560 @@ +//===- FlatLinearValueConstraints.h - Linear Constraints --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_ANALYSIS_FLATLINEARVALUECONSTRAINTS_H +#define MLIR_ANALYSIS_FLATLINEARVALUECONSTRAINTS_H + +#include "mlir/Analysis/Presburger/IntegerRelation.h" +#include "mlir/Analysis/Presburger/Matrix.h" +#include "mlir/IR/AffineExpr.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/Support/LogicalResult.h" +#include + +namespace mlir { + +class AffineMap; +class IntegerSet; +class MLIRContext; +class Value; +class MemRefType; +struct MutableAffineMap; + +namespace presburger { +class MultiAffineFunction; +} // namespace presburger + +/// FlatLinearConstraints is an extension of IntegerPolyhedron. It provides an +/// AffineExpr-based API. +class FlatLinearConstraints : public presburger::IntegerPolyhedron { +public: + /// Constructs a constraint system reserving memory for the specified number + /// of constraints and variables. `valArgs` are the optional SSA values + /// associated with each dimension/symbol. These must either be empty or match + /// the number of dimensions and symbols. 
+ FlatLinearConstraints(unsigned numReservedInequalities, + unsigned numReservedEqualities, + unsigned numReservedCols, unsigned numDims, + unsigned numSymbols, unsigned numLocals) + : IntegerPolyhedron(numReservedInequalities, numReservedEqualities, + numReservedCols, + presburger::PresburgerSpace::getSetSpace( + numDims, numSymbols, numLocals)) { + assert(numReservedCols >= getNumVars() + 1); + } + + /// Constructs a constraint system with the specified number of dimensions + /// and symbols. `valArgs` are the optional SSA values associated with each + /// dimension/symbol. These must either be empty or match the number of + /// dimensions and symbols. + FlatLinearConstraints(unsigned numDims = 0, unsigned numSymbols = 0, + unsigned numLocals = 0) + : FlatLinearConstraints(/*numReservedInequalities=*/0, + /*numReservedEqualities=*/0, + /*numReservedCols=*/numDims + numSymbols + + numLocals + 1, + numDims, numSymbols, numLocals) {} + + FlatLinearConstraints(const IntegerPolyhedron &fac) + : IntegerPolyhedron(fac) {} + + /// Return the kind of this object. + Kind getKind() const override { return Kind::FlatLinearConstraints; } + + static bool classof(const IntegerRelation *cst) { + return cst->getKind() >= Kind::FlatLinearConstraints && + cst->getKind() <= Kind::FlatAffineRelation; + } + + /// Clones this object. + std::unique_ptr clone() const; + + /// Adds a bound for the variable at the specified position with constraints + /// being drawn from the specified bound map. In case of an EQ bound, the + /// bound map is expected to have exactly one result. In case of a LB/UB, the + /// bound map may have more than one result, for each of which an inequality + /// is added. + /// + /// The bound can be added as open or closed by specifying isClosedBound. In + /// case of a LB/UB, isClosedBound = false means the bound is added internally + /// as a closed bound by +1/-1 respectively. In case of an EQ bound, it can + /// only be added as a closed bound. + /// + /// Note: The dimensions/symbols of this FlatLinearConstraints must match the + /// dimensions/symbols of the affine map. + LogicalResult addBound(BoundType type, unsigned pos, AffineMap boundMap, + bool isClosedBound); + + /// Adds a bound for the variable at the specified position with constraints + /// being drawn from the specified bound map. In case of an EQ bound, the + /// bound map is expected to have exactly one result. In case of a LB/UB, the + /// bound map may have more than one result, for each of which an inequality + /// is added. + /// Note: The dimensions/symbols of this FlatLinearConstraints must match the + /// dimensions/symbols of the affine map. By default the lower bound is closed + /// and the upper bound is open. + LogicalResult addBound(BoundType type, unsigned pos, AffineMap boundMap); + + /// The `addBound` overload above hides the inherited overloads by default, so + /// we explicitly introduce them here. + using IntegerPolyhedron::addBound; + + /// Returns the constraint system as an integer set. Returns a null integer + /// set if the system has no constraints, or if an integer set couldn't be + /// constructed as a result of a local variable's explicit representation not + /// being known and such a local variable appearing in any of the constraints. + IntegerSet getAsIntegerSet(MLIRContext *context) const; + + /// Computes the lower and upper bounds of the first `num` dimensional + /// variables (starting at `offset`) as an affine map of the remaining + /// variables (dimensional and symbolic). 
This method is able to detect + /// variables as floordiv's and mod's of affine expressions of other + /// variables with respect to (positive) constants. Sets bound map to a + /// null AffineMap if such a bound can't be found (or yet unimplemented). + /// + /// By default the returned lower bounds are closed and upper bounds are open. + /// If `closedUb` is true, the upper bound is closed. + void getSliceBounds(unsigned offset, unsigned num, MLIRContext *context, + SmallVectorImpl *lbMaps, + SmallVectorImpl *ubMaps, + bool closedUB = false); + + /// Composes an affine map whose dimensions and symbols match one to one with + /// the dimensions and symbols of this FlatLinearConstraints. The results of + /// the map `other` are added as the leading dimensions of this constraint + /// system. Returns failure if `other` is a semi-affine map. + LogicalResult composeMatchingMap(AffineMap other); + + /// Gets the lower and upper bound of the `offset` + `pos`th variable + /// treating [0, offset) U [offset + num, symStartPos) as dimensions and + /// [symStartPos, getNumDimAndSymbolVars) as symbols, and `pos` lies in + /// [0, num). The multi-dimensional maps in the returned pair represent the + /// max and min of potentially multiple affine expressions. `localExprs` holds + /// pre-computed AffineExpr's for all local variables in the system. + /// + /// By default the returned lower bounds are closed and upper bounds are open. + /// If `closedUb` is true, the upper bound is closed. + std::pair + getLowerAndUpperBound(unsigned pos, unsigned offset, unsigned num, + unsigned symStartPos, ArrayRef localExprs, + MLIRContext *context, bool closedUB = false) const; + + /// Insert variables of the specified kind at position `pos`. Positions are + /// relative to the kind of variable. The coefficient columns corresponding + /// to the added variables are initialized to zero. `vals` are the Values + /// corresponding to the variables. Values should not be used with + /// VarKind::Local since values can only be attached to non-local variables. + /// Return the absolute column position (i.e., not relative to the kind of + /// variable) of the first added variable. + /// + /// Note: Empty Values are allowed in `vals`. + unsigned insertDimVar(unsigned pos, unsigned num = 1) { + return insertVar(VarKind::SetDim, pos, num); + } + unsigned insertSymbolVar(unsigned pos, unsigned num = 1) { + return insertVar(VarKind::Symbol, pos, num); + } + unsigned insertLocalVar(unsigned pos, unsigned num = 1) { + return insertVar(VarKind::Local, pos, num); + } + + /// Append variables of the specified kind after the last variable of that + /// kind. The coefficient columns corresponding to the added variables are + /// initialized to zero. `vals` are the Values corresponding to the + /// variables. Return the absolute column position (i.e., not relative to the + /// kind of variable) of the first appended variable. + /// + /// Note: Empty Values are allowed in `vals`. + unsigned appendDimVar(unsigned num = 1) { + return appendVar(VarKind::SetDim, num); + } + unsigned appendSymbolVar(unsigned num = 1) { + return appendVar(VarKind::Symbol, num); + } + unsigned appendLocalVar(unsigned num = 1) { + return appendVar(VarKind::Local, num); + } + +protected: + using VarKind = presburger::VarKind; + + /// Compute an explicit representation for local vars. For all systems coming + /// from MLIR integer sets, maps, or expressions where local vars were + /// introduced to model floordivs and mods, this always succeeds. 
+ LogicalResult computeLocalVars(SmallVectorImpl &memo, + MLIRContext *context) const; + + /// Given an affine map that is aligned with this constraint system: + /// * Flatten the map. + /// * Add newly introduced local columns at the beginning of this constraint + /// system (local column pos 0). + /// * Add equalities that define the new local columns to this constraint + /// system. + /// * Return the flattened expressions via `flattenedExprs`. + /// + /// Note: This is a shared helper function of `addLowerOrUpperBound` and + /// `composeMatchingMap`. + LogicalResult flattenAlignedMapAndMergeLocals( + AffineMap map, std::vector> *flattenedExprs); + + /// Prints the number of constraints, dimensions, symbols and locals in the + /// FlatLinearConstraints. Also, prints for each variable whether there is + /// an SSA Value attached to it. + void printSpace(raw_ostream &os) const override; +}; + +/// FlatLinearValueConstraints represents an extension of FlatLinearConstraints +/// where each non-local variable can have an SSA Value attached to it. +class FlatLinearValueConstraints : public FlatLinearConstraints { +public: + /// Constructs a constraint system reserving memory for the specified number + /// of constraints and variables. `valArgs` are the optional SSA values + /// associated with each dimension/symbol. These must either be empty or match + /// the number of dimensions and symbols. + FlatLinearValueConstraints(unsigned numReservedInequalities, + unsigned numReservedEqualities, + unsigned numReservedCols, unsigned numDims, + unsigned numSymbols, unsigned numLocals, + ArrayRef> valArgs) + : FlatLinearConstraints(numReservedInequalities, numReservedEqualities, + numReservedCols, numDims, numSymbols, numLocals) { + assert(valArgs.empty() || valArgs.size() == getNumDimAndSymbolVars()); + values.reserve(numReservedCols); + if (valArgs.empty()) + values.resize(getNumDimAndSymbolVars(), std::nullopt); + else + values.append(valArgs.begin(), valArgs.end()); + } + + /// Constructs a constraint system reserving memory for the specified number + /// of constraints and variables. `valArgs` are the optional SSA values + /// associated with each dimension/symbol. These must either be empty or match + /// the number of dimensions and symbols. + FlatLinearValueConstraints(unsigned numReservedInequalities, + unsigned numReservedEqualities, + unsigned numReservedCols, unsigned numDims, + unsigned numSymbols, unsigned numLocals, + ArrayRef valArgs) + : FlatLinearConstraints(numReservedInequalities, numReservedEqualities, + numReservedCols, numDims, numSymbols, numLocals) { + assert(valArgs.empty() || valArgs.size() == getNumDimAndSymbolVars()); + values.reserve(numReservedCols); + if (valArgs.empty()) + values.resize(getNumDimAndSymbolVars(), std::nullopt); + else + values.append(valArgs.begin(), valArgs.end()); + } + + /// Constructs a constraint system with the specified number of dimensions + /// and symbols. `valArgs` are the optional SSA values associated with each + /// dimension/symbol. These must either be empty or match the number of + /// dimensions and symbols. + FlatLinearValueConstraints(unsigned numDims, unsigned numSymbols, + unsigned numLocals, + ArrayRef> valArgs) + : FlatLinearValueConstraints(/*numReservedInequalities=*/0, + /*numReservedEqualities=*/0, + /*numReservedCols=*/numDims + numSymbols + + numLocals + 1, + numDims, numSymbols, numLocals, valArgs) {} + + /// Constructs a constraint system with the specified number of dimensions + /// and symbols. 
`valArgs` are the optional SSA values associated with each + /// dimension/symbol. These must either be empty or match the number of + /// dimensions and symbols. + FlatLinearValueConstraints(unsigned numDims = 0, unsigned numSymbols = 0, + unsigned numLocals = 0, + ArrayRef valArgs = {}) + : FlatLinearValueConstraints(/*numReservedInequalities=*/0, + /*numReservedEqualities=*/0, + /*numReservedCols=*/numDims + numSymbols + + numLocals + 1, + numDims, numSymbols, numLocals, valArgs) {} + + FlatLinearValueConstraints(const IntegerPolyhedron &fac, + ArrayRef> valArgs = {}) + : FlatLinearConstraints(fac) { + assert(valArgs.empty() || valArgs.size() == getNumDimAndSymbolVars()); + if (valArgs.empty()) + values.resize(getNumDimAndSymbolVars(), std::nullopt); + else + values.append(valArgs.begin(), valArgs.end()); + } + + /// Creates an affine constraint system from an IntegerSet. + explicit FlatLinearValueConstraints(IntegerSet set, ValueRange operands = {}); + + // Construct a hyperrectangular constraint set from ValueRanges that represent + // induction variables, lower and upper bounds. `ivs`, `lbs` and `ubs` are + // expected to match one to one. The order of variables and constraints is: + // + // ivs | lbs | ubs | eq/ineq + // ----+-----+-----+--------- + // 1 -1 0 >= 0 + // ----+-----+-----+--------- + // -1 0 1 >= 0 + // + // All dimensions as set as VarKind::SetDim. + static FlatLinearValueConstraints + getHyperrectangular(ValueRange ivs, ValueRange lbs, ValueRange ubs); + + /// Return the kind of this object. + Kind getKind() const override { return Kind::FlatLinearValueConstraints; } + + static bool classof(const IntegerRelation *cst) { + return cst->getKind() >= Kind::FlatLinearValueConstraints && + cst->getKind() <= Kind::FlatAffineRelation; + } + + /// Replaces the contents of this FlatLinearValueConstraints with `other`. + void clearAndCopyFrom(const IntegerRelation &other) override; + + /// Adds a constant bound for the variable associated with the given Value. + void addBound(BoundType type, Value val, int64_t value); + using FlatLinearConstraints::addBound; + + /// Returns the Value associated with the pos^th variable. Asserts if + /// no Value variable was associated. + inline Value getValue(unsigned pos) const { + assert(pos < getNumDimAndSymbolVars() && "Invalid position"); + assert(hasValue(pos) && "variable's Value not set"); + return *values[pos]; + } + + /// Returns the Values associated with variables in range [start, end). + /// Asserts if no Value was associated with one of these variables. + inline void getValues(unsigned start, unsigned end, + SmallVectorImpl *values) const { + assert(end <= getNumDimAndSymbolVars() && "invalid end position"); + assert(start <= end && "invalid start position"); + values->clear(); + values->reserve(end - start); + for (unsigned i = start; i < end; i++) + values->push_back(getValue(i)); + } + inline void getAllValues(SmallVectorImpl *values) const { + getValues(0, getNumDimAndSymbolVars(), values); + } + + inline ArrayRef> getMaybeValues() const { + return {values.data(), values.size()}; + } + + inline ArrayRef> + getMaybeValues(presburger::VarKind kind) const { + assert(kind != VarKind::Local && + "Local variables do not have any value attached to them."); + return {values.data() + getVarKindOffset(kind), getNumVarKind(kind)}; + } + + /// Returns true if the pos^th variable has an associated Value. 
+ inline bool hasValue(unsigned pos) const { + assert(pos < getNumDimAndSymbolVars() && "Invalid position"); + return values[pos].has_value(); + } + + /// Returns true if at least one variable has an associated Value. + bool hasValues() const; + + unsigned appendDimVar(ValueRange vals); + using FlatLinearConstraints::appendDimVar; + + unsigned appendSymbolVar(ValueRange vals); + using FlatLinearConstraints::appendSymbolVar; + + unsigned insertDimVar(unsigned pos, ValueRange vals); + using FlatLinearConstraints::insertDimVar; + + unsigned insertSymbolVar(unsigned pos, ValueRange vals); + using FlatLinearConstraints::insertSymbolVar; + + unsigned insertVar(presburger::VarKind kind, unsigned pos, + unsigned num = 1) override; + unsigned insertVar(presburger::VarKind kind, unsigned pos, ValueRange vals); + + /// Removes variables in the column range [varStart, varLimit), and copies any + /// remaining valid data into place, updates member variables, and resizes + /// arrays as needed. + void removeVarRange(presburger::VarKind kind, unsigned varStart, + unsigned varLimit) override; + using IntegerPolyhedron::removeVarRange; + + /// Sets the Value associated with the pos^th variable. + inline void setValue(unsigned pos, Value val) { + assert(pos < getNumDimAndSymbolVars() && "invalid var position"); + values[pos] = val; + } + + /// Sets the Values associated with the variables in the range [start, end). + /// The range must contain only dim and symbol variables. + void setValues(unsigned start, unsigned end, ArrayRef values) { + assert(end <= getNumVars() && "invalid end position"); + assert(start <= end && "invalid start position"); + assert(values.size() == end - start && + "value should be provided for each variable in the range."); + for (unsigned i = start; i < end; ++i) + setValue(i, values[i - start]); + } + + /// Looks up the position of the variable with the specified Value. Returns + /// true if found (false otherwise). `pos` is set to the (column) position of + /// the variable. + bool findVar(Value val, unsigned *pos) const; + + /// Returns true if a variable with the specified Value exists, false + /// otherwise. + bool containsVar(Value val) const; + + /// Projects out the variable that is associate with Value. + void projectOut(Value val); + using IntegerPolyhedron::projectOut; + + /// Swap the posA^th variable with the posB^th variable. + void swapVar(unsigned posA, unsigned posB) override; + + /// Prints the number of constraints, dimensions, symbols and locals in the + /// FlatAffineValueConstraints. Also, prints for each variable whether there + /// is an SSA Value attached to it. + void printSpace(raw_ostream &os) const override; + + /// Align `map` with this constraint system based on `operands`. Each operand + /// must already have a corresponding dim/symbol in this constraint system. + AffineMap computeAlignedMap(AffineMap map, ValueRange operands) const; + + /// Merge and align the variables of `this` and `other` starting at + /// `offset`, so that both constraint systems get the union of the contained + /// variables that is dimension-wise and symbol-wise unique; both + /// constraint systems are updated so that they have the union of all + /// variables, with `this`'s original variables appearing first followed + /// by any of `other`'s variables that didn't appear in `this`. Local + /// variables in `other` that have the same division representation as local + /// variables in `this` are merged into one. 
+ // E.g.: Input: `this` has (%i, %j) [%M, %N] + // `other` has (%k, %j) [%P, %N, %M] + // Output: both `this`, `other` have (%i, %j, %k) [%M, %N, %P] + // + void mergeAndAlignVarsWithOther(unsigned offset, + FlatLinearValueConstraints *other); + + /// Merge and align symbols of `this` and `other` such that both get union of + /// of symbols that are unique. Symbols in `this` and `other` should be + /// unique. Symbols with Value as `None` are considered to be inequal to all + /// other symbols. + void mergeSymbolVars(FlatLinearValueConstraints &other); + + /// Returns true if this constraint system and `other` are in the same + /// space, i.e., if they are associated with the same set of variables, + /// appearing in the same order. Returns false otherwise. + bool areVarsAlignedWithOther(const FlatLinearConstraints &other); + + /// Updates the constraints to be the smallest bounding (enclosing) box that + /// contains the points of `this` set and that of `other`, with the symbols + /// being treated specially. For each of the dimensions, the min of the lower + /// bounds (symbolic) and the max of the upper bounds (symbolic) is computed + /// to determine such a bounding box. `other` is expected to have the same + /// dimensional variables as this constraint system (in the same order). + /// + /// E.g.: + /// 1) this = {0 <= d0 <= 127}, + /// other = {16 <= d0 <= 192}, + /// output = {0 <= d0 <= 192} + /// 2) this = {s0 + 5 <= d0 <= s0 + 20}, + /// other = {s0 + 1 <= d0 <= s0 + 9}, + /// output = {s0 + 1 <= d0 <= s0 + 20} + /// 3) this = {0 <= d0 <= 5, 1 <= d1 <= 9} + /// other = {2 <= d0 <= 6, 5 <= d1 <= 15}, + /// output = {0 <= d0 <= 6, 1 <= d1 <= 15} + LogicalResult unionBoundingBox(const FlatLinearValueConstraints &other); + using IntegerPolyhedron::unionBoundingBox; + +protected: + /// Eliminates the variable at the specified position using Fourier-Motzkin + /// variable elimination, but uses Gaussian elimination if there is an + /// equality involving that variable. If the result of the elimination is + /// integer exact, `*isResultIntegerExact` is set to true. If `darkShadow` is + /// set to true, a potential under approximation (subset) of the rational + /// shadow / exact integer shadow is computed. + // See implementation comments for more details. + void fourierMotzkinEliminate(unsigned pos, bool darkShadow = false, + bool *isResultIntegerExact = nullptr) override; + + /// Returns false if the fields corresponding to various variable counts, or + /// equality/inequality buffer sizes aren't consistent; true otherwise. This + /// is meant to be used within an assert internally. + bool hasConsistentState() const override; + + /// Values corresponding to the (column) non-local variables of this + /// constraint system appearing in the order the variables correspond to + /// columns. Variables that aren't associated with any Value are set to + /// None. + SmallVector, 8> values; +}; + +/// Flattens 'expr' into 'flattenedExpr', which contains the coefficients of the +/// dimensions, symbols, and additional variables that represent floor divisions +/// of dimensions, symbols, and in turn other floor divisions. Returns failure +/// if 'expr' could not be flattened (i.e., semi-affine is not yet handled). +/// 'cst' contains constraints that connect newly introduced local variables +/// to existing dimensional and symbolic variables. See documentation for +/// AffineExprFlattener on how mod's and div's are flattened. 
+LogicalResult getFlattenedAffineExpr(AffineExpr expr, unsigned numDims,
+                                     unsigned numSymbols,
+                                     SmallVectorImpl<int64_t> *flattenedExpr,
+                                     FlatLinearConstraints *cst = nullptr);
+
+/// Flattens the result expressions of the map to their corresponding flattened
+/// forms and sets them in 'flattenedExprs'. Returns failure if any expression
+/// in the map could not be flattened (i.e., semi-affine is not yet handled).
+/// 'cst' contains constraints that connect newly introduced local variables to
+/// existing dimensional and symbolic variables. See documentation for
+/// AffineExprFlattener on how mod's and div's are flattened. For all affine
+/// expressions that share the same operands (like those of an affine map), this
+/// method should be used instead of repeatedly calling getFlattenedAffineExpr
+/// since local variables added to deal with div's and mod's will be reused
+/// across expressions.
+LogicalResult
+getFlattenedAffineExprs(AffineMap map,
+                        std::vector<SmallVector<int64_t, 8>> *flattenedExprs,
+                        FlatLinearConstraints *cst = nullptr);
+LogicalResult
+getFlattenedAffineExprs(IntegerSet set,
+                        std::vector<SmallVector<int64_t, 8>> *flattenedExprs,
+                        FlatLinearConstraints *cst = nullptr);
+
+LogicalResult
+getMultiAffineFunctionFromMap(AffineMap map,
+                              presburger::MultiAffineFunction &multiAff);
+
+/// Re-indexes the dimensions and symbols of an affine map with given `operands`
+/// values to align with `dims` and `syms` values.
+///
+/// Each dimension/symbol of the map, bound to an operand `o`, is replaced with
+/// dimension `i`, where `i` is the position of `o` within `dims`. If `o` is not
+/// in `dims`, replace it with symbol `i`, where `i` is the position of `o`
+/// within `syms`. If `o` is not in `syms` either, replace it with a new symbol.
+///
+/// Note: If a value appears multiple times as a dimension/symbol (or both), all
+/// corresponding dim/sym expressions are replaced with the first dimension
+/// bound to that value (or first symbol if no such dimension exists).
+///
+/// The resulting affine map has `dims.size()` many dimensions and at least
+/// `syms.size()` many symbols.
+///
+/// The SSA values of the symbols of the resulting map are optionally returned
+/// via `newSyms`. This is a concatenation of `syms` with the SSA values of the
+/// newly added symbols.
+///
+/// Note: As part of this re-indexing, dimensions may turn into symbols, or vice
+/// versa.
+AffineMap alignAffineMapWithValues(AffineMap map, ValueRange operands,
+                                   ValueRange dims, ValueRange syms,
+                                   SmallVector<Value> *newSyms = nullptr);
+
+} // namespace mlir
+
+#endif // MLIR_ANALYSIS_FLATLINEARVALUECONSTRAINTS_H
diff --git a/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h b/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h
index 347be26325e5a..8b0c2a561cfb8 100644
--- a/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h
+++ b/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h
@@ -54,10 +54,12 @@ class IntegerRelation {
 public:
   /// All derived classes of IntegerRelation.
enum class Kind { - FlatAffineConstraints, - FlatAffineValueConstraints, IntegerRelation, IntegerPolyhedron, + FlatLinearConstraints, + FlatLinearValueConstraints, + FlatAffineValueConstraints, + FlatAffineRelation }; /// Constructs a relation reserving memory for the specified number @@ -848,7 +850,8 @@ class IntegerPolyhedron : public IntegerRelation { Kind getKind() const override { return Kind::IntegerPolyhedron; } static bool classof(const IntegerRelation *cst) { - return cst->getKind() == Kind::IntegerPolyhedron; + return cst->getKind() >= Kind::IntegerPolyhedron && + cst->getKind() <= Kind::FlatAffineRelation; } // Clones this object. diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/AffineStructures.h b/mlir/include/mlir/Dialect/Affine/Analysis/AffineStructures.h index 1b302f55422d8..6249428fb8e15 100644 --- a/mlir/include/mlir/Dialect/Affine/Analysis/AffineStructures.h +++ b/mlir/include/mlir/Dialect/Affine/Analysis/AffineStructures.h @@ -13,6 +13,7 @@ #ifndef MLIR_DIALECT_AFFINE_ANALYSIS_AFFINESTRUCTURES_H #define MLIR_DIALECT_AFFINE_ANALYSIS_AFFINESTRUCTURES_H +#include "mlir/Analysis/FlatLinearValueConstraints.h" #include "mlir/Analysis/Presburger/IntegerRelation.h" #include "mlir/Analysis/Presburger/Matrix.h" #include "mlir/IR/AffineExpr.h" @@ -38,117 +39,20 @@ namespace presburger { class MultiAffineFunction; } // namespace presburger -/// FlatAffineValueConstraints represents an extension of IntegerPolyhedron -/// where each non-local variable can have an SSA Value attached to it. -class FlatAffineValueConstraints : public presburger::IntegerPolyhedron { +/// FlatAffineValueConstraints is an extension of FlatLinearValueConstraints +/// with helper functions for Affine dialect ops. +class FlatAffineValueConstraints : public FlatLinearValueConstraints { public: - /// Constructs a constraint system reserving memory for the specified number - /// of constraints and variables. `valArgs` are the optional SSA values - /// associated with each dimension/symbol. These must either be empty or match - /// the number of dimensions and symbols. - FlatAffineValueConstraints(unsigned numReservedInequalities, - unsigned numReservedEqualities, - unsigned numReservedCols, unsigned numDims, - unsigned numSymbols, unsigned numLocals, - ArrayRef> valArgs) - : IntegerPolyhedron(numReservedInequalities, numReservedEqualities, - numReservedCols, - presburger::PresburgerSpace::getSetSpace( - numDims, numSymbols, numLocals)) { - assert(numReservedCols >= getNumVars() + 1); - assert(valArgs.empty() || valArgs.size() == getNumDimAndSymbolVars()); - values.reserve(numReservedCols); - if (valArgs.empty()) - values.resize(getNumDimAndSymbolVars(), std::nullopt); - else - values.append(valArgs.begin(), valArgs.end()); - } - - /// Constructs a constraint system reserving memory for the specified number - /// of constraints and variables. `valArgs` are the optional SSA values - /// associated with each dimension/symbol. These must either be empty or match - /// the number of dimensions and symbols. 
- FlatAffineValueConstraints(unsigned numReservedInequalities, - unsigned numReservedEqualities, - unsigned numReservedCols, unsigned numDims, - unsigned numSymbols, unsigned numLocals, - ArrayRef valArgs = {}) - : IntegerPolyhedron(numReservedInequalities, numReservedEqualities, - numReservedCols, - presburger::PresburgerSpace::getSetSpace( - numDims, numSymbols, numLocals)) { - assert(numReservedCols >= getNumVars() + 1); - assert(valArgs.empty() || valArgs.size() == getNumDimAndSymbolVars()); - values.reserve(numReservedCols); - if (valArgs.empty()) - values.resize(getNumDimAndSymbolVars(), std::nullopt); - else - values.append(valArgs.begin(), valArgs.end()); - } + using FlatLinearValueConstraints::FlatLinearValueConstraints; - /// Constructs a constraint system with the specified number of dimensions - /// and symbols. `valArgs` are the optional SSA values associated with each - /// dimension/symbol. These must either be empty or match the number of - /// dimensions and symbols. - FlatAffineValueConstraints(unsigned numDims, unsigned numSymbols, - unsigned numLocals, - ArrayRef> valArgs) - : FlatAffineValueConstraints(/*numReservedInequalities=*/0, - /*numReservedEqualities=*/0, - /*numReservedCols=*/numDims + numSymbols + - numLocals + 1, - numDims, numSymbols, numLocals, valArgs) {} - - /// Constructs a constraint system with the specified number of dimensions - /// and symbols. `valArgs` are the optional SSA values associated with each - /// dimension/symbol. These must either be empty or match the number of - /// dimensions and symbols. - FlatAffineValueConstraints(unsigned numDims = 0, unsigned numSymbols = 0, - unsigned numLocals = 0, - ArrayRef valArgs = {}) - : FlatAffineValueConstraints(/*numReservedInequalities=*/0, - /*numReservedEqualities=*/0, - /*numReservedCols=*/numDims + numSymbols + - numLocals + 1, - numDims, numSymbols, numLocals, valArgs) {} - - FlatAffineValueConstraints(const IntegerPolyhedron &fac, - ArrayRef> valArgs = {}) - : IntegerPolyhedron(fac) { - assert(valArgs.empty() || valArgs.size() == getNumDimAndSymbolVars()); - if (valArgs.empty()) - values.resize(getNumDimAndSymbolVars(), std::nullopt); - else - values.append(valArgs.begin(), valArgs.end()); - } - - /// Creates an affine constraint system from an IntegerSet. - explicit FlatAffineValueConstraints(IntegerSet set, ValueRange operands = {}); - - // Construct a hyperrectangular constraint set from ValueRanges that represent - // induction variables, lower and upper bounds. `ivs`, `lbs` and `ubs` are - // expected to match one to one. The order of variables and constraints is: - // - // ivs | lbs | ubs | eq/ineq - // ----+-----+-----+--------- - // 1 -1 0 >= 0 - // ----+-----+-----+--------- - // -1 0 1 >= 0 - // - // All dimensions as set as VarKind::SetDim. - static FlatAffineValueConstraints - getHyperrectangular(ValueRange ivs, ValueRange lbs, ValueRange ubs); - - /// Return the kind of this FlatAffineConstraints. + /// Return the kind of this object. Kind getKind() const override { return Kind::FlatAffineValueConstraints; } static bool classof(const IntegerRelation *cst) { - return cst->getKind() == Kind::FlatAffineValueConstraints; + return cst->getKind() >= Kind::FlatAffineValueConstraints && + cst->getKind() <= Kind::FlatAffineRelation; } - /// Clones this object. - std::unique_ptr clone() const; - /// Adds constraints (lower and upper bounds) for the specified 'affine.for' /// operation's Value using IR information stored in its bound maps. 
The /// right variable is first looked up using `forOp`'s Value. Asserts if the @@ -191,32 +95,6 @@ class FlatAffineValueConstraints : public presburger::IntegerPolyhedron { /// the columns in the current one regarding numbers and values. void addAffineIfOpDomain(AffineIfOp ifOp); - /// Adds a bound for the variable at the specified position with constraints - /// being drawn from the specified bound map. In case of an EQ bound, the - /// bound map is expected to have exactly one result. In case of a LB/UB, the - /// bound map may have more than one result, for each of which an inequality - /// is added. - /// - /// The bound can be added as open or closed by specifying isClosedBound. In - /// case of a LB/UB, isClosedBound = false means the bound is added internally - /// as a closed bound by +1/-1 respectively. In case of an EQ bound, it can - /// only be added as a closed bound. - /// - /// Note: The dimensions/symbols of this FlatAffineConstraints must match the - /// dimensions/symbols of the affine map. - LogicalResult addBound(BoundType type, unsigned pos, AffineMap boundMap, - bool isClosedBound); - - /// Adds a bound for the variable at the specified position with constraints - /// being drawn from the specified bound map. In case of an EQ bound, the - /// bound map is expected to have exactly one result. In case of a LB/UB, the - /// bound map may have more than one result, for each of which an inequality - /// is added. - /// Note: The dimensions/symbols of this FlatAffineConstraints must match the - /// dimensions/symbols of the affine map. By default the lower bound is closed - /// and the upper bound is open. - LogicalResult addBound(BoundType type, unsigned pos, AffineMap boundMap); - /// Adds a bound for the variable at the specified position with constraints /// being drawn from the specified bound map and operands. In case of an /// EQ bound, the bound map is expected to have exactly one result. In case @@ -224,62 +102,15 @@ class FlatAffineValueConstraints : public presburger::IntegerPolyhedron { /// an inequality is added. LogicalResult addBound(BoundType type, unsigned pos, AffineMap boundMap, ValueRange operands); + using FlatLinearValueConstraints::addBound; - /// Adds a constant bound for the variable associated with the given Value. - void addBound(BoundType type, Value val, int64_t value); - - /// The `addBound` overload above hides the inherited overloads by default, so - /// we explicitly introduce them here. - using IntegerPolyhedron::addBound; - - /// Returns the constraint system as an integer set. Returns a null integer - /// set if the system has no constraints, or if an integer set couldn't be - /// constructed as a result of a local variable's explicit representation not - /// being known and such a local variable appearing in any of the constraints. - IntegerSet getAsIntegerSet(MLIRContext *context) const; - - /// Computes the lower and upper bounds of the first `num` dimensional - /// variables (starting at `offset`) as an affine map of the remaining - /// variables (dimensional and symbolic). This method is able to detect - /// variables as floordiv's and mod's of affine expressions of other - /// variables with respect to (positive) constants. Sets bound map to a - /// null AffineMap if such a bound can't be found (or yet unimplemented). - /// - /// By default the returned lower bounds are closed and upper bounds are open. - /// If `closedUb` is true, the upper bound is closed. 
- void getSliceBounds(unsigned offset, unsigned num, MLIRContext *context, - SmallVectorImpl *lbMaps, - SmallVectorImpl *ubMaps, - bool closedUB = false); - - /// Composes an affine map whose dimensions and symbols match one to one with - /// the dimensions and symbols of this FlatAffineConstraints. The results of - /// the map `other` are added as the leading dimensions of this constraint - /// system. Returns failure if `other` is a semi-affine map. - LogicalResult composeMatchingMap(AffineMap other); - - /// Gets the lower and upper bound of the `offset` + `pos`th variable - /// treating [0, offset) U [offset + num, symStartPos) as dimensions and - /// [symStartPos, getNumDimAndSymbolVars) as symbols, and `pos` lies in - /// [0, num). The multi-dimensional maps in the returned pair represent the - /// max and min of potentially multiple affine expressions. `localExprs` holds - /// pre-computed AffineExpr's for all local variables in the system. - /// - /// By default the returned lower bounds are closed and upper bounds are open. - /// If `closedUb` is true, the upper bound is closed. - std::pair - getLowerAndUpperBound(unsigned pos, unsigned offset, unsigned num, - unsigned symStartPos, ArrayRef localExprs, - MLIRContext *context, bool closedUB = false) const; - - /// Returns the bound for the variable at `pos` from the inequality at - /// `ineqPos` as a 1-d affine value map (affine map + operands). The returned - /// affine value map can either be a lower bound or an upper bound depending - /// on the sign of atIneq(ineqPos, pos). Asserts if the row at `ineqPos` does - /// not involve the `pos`th variable. - void getIneqAsAffineValueMap(unsigned pos, unsigned ineqPos, - AffineValueMap &vmap, - MLIRContext *context) const; + /// Add the specified values as a dim or symbol var depending on its nature, + /// if it already doesn't exist in the system. `val` has to be either a + /// terminal symbol or a loop IV, i.e., it cannot be the result affine.apply + /// of any symbols or loop IVs. The variable is added to the end of the + /// existing dims or symbols. Additional information on the variable is + /// extracted from the IR and added to the constraint system. + void addInductionVarOrTerminalSymbol(Value val); /// Adds slice lower bounds represented by lower bounds in `lbMaps` and upper /// bounds in `ubMaps` to each variable in the constraint system which has @@ -292,79 +123,17 @@ class FlatAffineValueConstraints : public presburger::IntegerPolyhedron { ArrayRef ubMaps, ArrayRef operands); - /// Looks up the position of the variable with the specified Value. Returns - /// true if found (false otherwise). `pos` is set to the (column) position of - /// the variable. - bool findVar(Value val, unsigned *pos) const; - - /// Returns true if an variable with the specified Value exists, false - /// otherwise. - bool containsVar(Value val) const; - - /// Swap the posA^th variable with the posB^th variable. - void swapVar(unsigned posA, unsigned posB) override; - - /// Insert variables of the specified kind at position `pos`. Positions are - /// relative to the kind of variable. The coefficient columns corresponding - /// to the added variables are initialized to zero. `vals` are the Values - /// corresponding to the variables. Values should not be used with - /// VarKind::Local since values can only be attached to non-local variables. - /// Return the absolute column position (i.e., not relative to the kind of - /// variable) of the first added variable. 
- /// - /// Note: Empty Values are allowed in `vals`. - unsigned insertDimVar(unsigned pos, unsigned num = 1) { - return insertVar(VarKind::SetDim, pos, num); - } - unsigned insertSymbolVar(unsigned pos, unsigned num = 1) { - return insertVar(VarKind::Symbol, pos, num); - } - unsigned insertLocalVar(unsigned pos, unsigned num = 1) { - return insertVar(VarKind::Local, pos, num); - } - unsigned insertDimVar(unsigned pos, ValueRange vals); - unsigned insertSymbolVar(unsigned pos, ValueRange vals); - unsigned insertVar(presburger::VarKind kind, unsigned pos, - unsigned num = 1) override; - unsigned insertVar(presburger::VarKind kind, unsigned pos, ValueRange vals); - - /// Append variables of the specified kind after the last variable of that - /// kind. The coefficient columns corresponding to the added variables are - /// initialized to zero. `vals` are the Values corresponding to the - /// variables. Return the absolute column position (i.e., not relative to the - /// kind of variable) of the first appended variable. - /// - /// Note: Empty Values are allowed in `vals`. - unsigned appendDimVar(ValueRange vals); - unsigned appendSymbolVar(ValueRange vals); - unsigned appendDimVar(unsigned num = 1) { - return appendVar(VarKind::SetDim, num); - } - unsigned appendSymbolVar(unsigned num = 1) { - return appendVar(VarKind::Symbol, num); - } - unsigned appendLocalVar(unsigned num = 1) { - return appendVar(VarKind::Local, num); - } - - /// Removes variables in the column range [varStart, varLimit), and copies any - /// remaining valid data into place, updates member variables, and resizes - /// arrays as needed. - void removeVarRange(presburger::VarKind kind, unsigned varStart, - unsigned varLimit) override; - using IntegerPolyhedron::removeVarRange; - - /// Add the specified values as a dim or symbol var depending on its nature, - /// if it already doesn't exist in the system. `val` has to be either a - /// terminal symbol or a loop IV, i.e., it cannot be the result affine.apply - /// of any symbols or loop IVs. The variable is added to the end of the - /// existing dims or symbols. Additional information on the variable is - /// extracted from the IR and added to the constraint system. - void addInductionVarOrTerminalSymbol(Value val); + /// Changes all symbol variables which are loop IVs to dim variables. + void convertLoopIVSymbolsToDims(); - /// Align `map` with this constraint system based on `operands`. Each operand - /// must already have a corresponding dim/symbol in this constraint system. - AffineMap computeAlignedMap(AffineMap map, ValueRange operands) const; + /// Returns the bound for the variable at `pos` from the inequality at + /// `ineqPos` as a 1-d affine value map (affine map + operands). The returned + /// affine value map can either be a lower bound or an upper bound depending + /// on the sign of atIneq(ineqPos, pos). Asserts if the row at `ineqPos` does + /// not involve the `pos`th variable. + void getIneqAsAffineValueMap(unsigned pos, unsigned ineqPos, + AffineValueMap &vmap, + MLIRContext *context) const; /// Composes the affine value map with this FlatAffineValueConstrains, adding /// the results of the map as dimensions at the front @@ -373,168 +142,10 @@ class FlatAffineValueConstraints : public presburger::IntegerPolyhedron { /// /// Returns failure if the composition fails (when vMap is a semi-affine map). /// The vMap's operand Value's are used to look up the right positions in - /// the FlatAffineConstraints with which to associate. 
Every operand of vMap - /// should have a matching dim/symbol column in this constraint system (with - /// the same associated Value). + /// the FlatAffineValueConstraints with which to associate. Every operand of + /// vMap should have a matching dim/symbol column in this constraint system + /// (with the same associated Value). LogicalResult composeMap(const AffineValueMap *vMap); - - /// Projects out the variable that is associate with Value. - void projectOut(Value val); - using IntegerPolyhedron::projectOut; - - /// Changes all symbol variables which are loop IVs to dim variables. - void convertLoopIVSymbolsToDims(); - - /// Updates the constraints to be the smallest bounding (enclosing) box that - /// contains the points of `this` set and that of `other`, with the symbols - /// being treated specially. For each of the dimensions, the min of the lower - /// bounds (symbolic) and the max of the upper bounds (symbolic) is computed - /// to determine such a bounding box. `other` is expected to have the same - /// dimensional variables as this constraint system (in the same order). - /// - /// E.g.: - /// 1) this = {0 <= d0 <= 127}, - /// other = {16 <= d0 <= 192}, - /// output = {0 <= d0 <= 192} - /// 2) this = {s0 + 5 <= d0 <= s0 + 20}, - /// other = {s0 + 1 <= d0 <= s0 + 9}, - /// output = {s0 + 1 <= d0 <= s0 + 20} - /// 3) this = {0 <= d0 <= 5, 1 <= d1 <= 9} - /// other = {2 <= d0 <= 6, 5 <= d1 <= 15}, - /// output = {0 <= d0 <= 6, 1 <= d1 <= 15} - LogicalResult unionBoundingBox(const FlatAffineValueConstraints &other); - using IntegerPolyhedron::unionBoundingBox; - - /// Merge and align the variables of `this` and `other` starting at - /// `offset`, so that both constraint systems get the union of the contained - /// variables that is dimension-wise and symbol-wise unique; both - /// constraint systems are updated so that they have the union of all - /// variables, with `this`'s original variables appearing first followed - /// by any of `other`'s variables that didn't appear in `this`. Local - /// variables in `other` that have the same division representation as local - /// variables in `this` are merged into one. - // E.g.: Input: `this` has (%i, %j) [%M, %N] - // `other` has (%k, %j) [%P, %N, %M] - // Output: both `this`, `other` have (%i, %j, %k) [%M, %N, %P] - // - void mergeAndAlignVarsWithOther(unsigned offset, - FlatAffineValueConstraints *other); - - /// Returns true if this constraint system and `other` are in the same - /// space, i.e., if they are associated with the same set of variables, - /// appearing in the same order. Returns false otherwise. - bool areVarsAlignedWithOther(const FlatAffineValueConstraints &other); - - /// Replaces the contents of this FlatAffineValueConstraints with `other`. - void clearAndCopyFrom(const IntegerRelation &other) override; - - /// Returns the Value associated with the pos^th variable. Asserts if - /// no Value variable was associated. - inline Value getValue(unsigned pos) const { - assert(pos < getNumDimAndSymbolVars() && "Invalid position"); - assert(hasValue(pos) && "variable's Value not set"); - return *values[pos]; - } - - /// Returns true if the pos^th variable has an associated Value. - inline bool hasValue(unsigned pos) const { - assert(pos < getNumDimAndSymbolVars() && "Invalid position"); - return values[pos].has_value(); - } - - /// Returns true if at least one variable has an associated Value. - bool hasValues() const; - - /// Returns the Values associated with variables in range [start, end). 
- /// Asserts if no Value was associated with one of these variables. - inline void getValues(unsigned start, unsigned end, - SmallVectorImpl *values) const { - assert(end <= getNumDimAndSymbolVars() && "invalid end position"); - assert(start <= end && "invalid start position"); - values->clear(); - values->reserve(end - start); - for (unsigned i = start; i < end; i++) - values->push_back(getValue(i)); - } - inline void getAllValues(SmallVectorImpl *values) const { - getValues(0, getNumDimAndSymbolVars(), values); - } - - inline ArrayRef> getMaybeValues() const { - return {values.data(), values.size()}; - } - - inline ArrayRef> - getMaybeValues(presburger::VarKind kind) const { - assert(kind != VarKind::Local && - "Local variables do not have any value attached to them."); - return {values.data() + getVarKindOffset(kind), getNumVarKind(kind)}; - } - - /// Sets the Value associated with the pos^th variable. - inline void setValue(unsigned pos, Value val) { - assert(pos < getNumDimAndSymbolVars() && "invalid var position"); - values[pos] = val; - } - - /// Sets the Values associated with the variables in the range [start, end). - /// The range must contain only dim and symbol variables. - void setValues(unsigned start, unsigned end, ArrayRef values) { - assert(end <= getNumVars() && "invalid end position"); - assert(start <= end && "invalid start position"); - assert(values.size() == end - start && - "value should be provided for each variable in the range."); - for (unsigned i = start; i < end; ++i) - setValue(i, values[i - start]); - } - - /// Merge and align symbols of `this` and `other` such that both get union of - /// of symbols that are unique. Symbols in `this` and `other` should be - /// unique. Symbols with Value as `None` are considered to be inequal to all - /// other symbols. - void mergeSymbolVars(FlatAffineValueConstraints &other); - -protected: - using VarKind = presburger::VarKind; - - /// Returns false if the fields corresponding to various variable counts, or - /// equality/inequality buffer sizes aren't consistent; true otherwise. This - /// is meant to be used within an assert internally. - bool hasConsistentState() const override; - - /// Given an affine map that is aligned with this constraint system: - /// * Flatten the map. - /// * Add newly introduced local columns at the beginning of this constraint - /// system (local column pos 0). - /// * Add equalities that define the new local columns to this constraint - /// system. - /// * Return the flattened expressions via `flattenedExprs`. - /// - /// Note: This is a shared helper function of `addLowerOrUpperBound` and - /// `composeMatchingMap`. - LogicalResult flattenAlignedMapAndMergeLocals( - AffineMap map, std::vector> *flattenedExprs); - - /// Eliminates the variable at the specified position using Fourier-Motzkin - /// variable elimination, but uses Gaussian elimination if there is an - /// equality involving that variable. If the result of the elimination is - /// integer exact, `*isResultIntegerExact` is set to true. If `darkShadow` is - /// set to true, a potential under approximation (subset) of the rational - /// shadow / exact integer shadow is computed. - // See implementation comments for more details. - void fourierMotzkinEliminate(unsigned pos, bool darkShadow = false, - bool *isResultIntegerExact = nullptr) override; - - /// Prints the number of constraints, dimensions, symbols and locals in the - /// FlatAffineConstraints. 
Also, prints for each variable whether there is - /// an SSA Value attached to it. - void printSpace(raw_ostream &os) const override; - - /// Values corresponding to the (column) non-local variables of this - /// constraint system appearing in the order the variables correspond to - /// columns. Variables that aren't associated with any Value are set to - /// None. - SmallVector, 8> values; }; /// A FlatAffineRelation represents a set of ordered pairs (domain -> range) @@ -570,6 +181,13 @@ class FlatAffineRelation : public FlatAffineValueConstraints { : FlatAffineValueConstraints(fac), numDomainDims(numDomainDims), numRangeDims(numRangeDims) {} + /// Return the kind of this object. + Kind getKind() const override { return Kind::FlatAffineRelation; } + + static bool classof(const IntegerRelation *cst) { + return cst->getKind() == Kind::FlatAffineRelation; + } + /// Returns a set corresponding to the domain/range of the affine relation. FlatAffineValueConstraints getDomainSet() const; FlatAffineValueConstraints getRangeSet() const; @@ -616,66 +234,6 @@ class FlatAffineRelation : public FlatAffineValueConstraints { unsigned numRangeDims; }; -/// Flattens 'expr' into 'flattenedExpr', which contains the coefficients of the -/// dimensions, symbols, and additional variables that represent floor divisions -/// of dimensions, symbols, and in turn other floor divisions. Returns failure -/// if 'expr' could not be flattened (i.e., semi-affine is not yet handled). -/// 'cst' contains constraints that connect newly introduced local variables -/// to existing dimensional and symbolic variables. See documentation for -/// AffineExprFlattener on how mod's and div's are flattened. -LogicalResult getFlattenedAffineExpr(AffineExpr expr, unsigned numDims, - unsigned numSymbols, - SmallVectorImpl *flattenedExpr, - FlatAffineValueConstraints *cst = nullptr); - -/// Flattens the result expressions of the map to their corresponding flattened -/// forms and set in 'flattenedExprs'. Returns failure if any expression in the -/// map could not be flattened (i.e., semi-affine is not yet handled). 'cst' -/// contains constraints that connect newly introduced local variables to -/// existing dimensional and / symbolic variables. See documentation for -/// AffineExprFlattener on how mod's and div's are flattened. For all affine -/// expressions that share the same operands (like those of an affine map), this -/// method should be used instead of repeatedly calling getFlattenedAffineExpr -/// since local variables added to deal with div's and mod's will be reused -/// across expressions. -LogicalResult -getFlattenedAffineExprs(AffineMap map, - std::vector> *flattenedExprs, - FlatAffineValueConstraints *cst = nullptr); -LogicalResult -getFlattenedAffineExprs(IntegerSet set, - std::vector> *flattenedExprs, - FlatAffineValueConstraints *cst = nullptr); - -LogicalResult -getMultiAffineFunctionFromMap(AffineMap map, - presburger::MultiAffineFunction &multiAff); - -/// Re-indexes the dimensions and symbols of an affine map with given `operands` -/// values to align with `dims` and `syms` values. -/// -/// Each dimension/symbol of the map, bound to an operand `o`, is replaced with -/// dimension `i`, where `i` is the position of `o` within `dims`. If `o` is not -/// in `dims`, replace it with symbol `i`, where `i` is the position of `o` -/// within `syms`. If `o` is not in `syms` either, replace it with a new symbol. 
-/// -/// Note: If a value appears multiple times as a dimension/symbol (or both), all -/// corresponding dim/sym expressions are replaced with the first dimension -/// bound to that value (or first symbol if no such dimension exists). -/// -/// The resulting affine map has `dims.size()` many dimensions and at least -/// `syms.size()` many symbols. -/// -/// The SSA values of the symbols of the resulting map are optionally returned -/// via `newSyms`. This is a concatenation of `syms` with the SSA values of the -/// newly added symbols. -/// -/// Note: As part of this re-indexing, dimensions may turn into symbols, or vice -/// versa. -AffineMap alignAffineMapWithValues(AffineMap map, ValueRange operands, - ValueRange dims, ValueRange syms, - SmallVector *newSyms = nullptr); - /// Builds a relation from the given AffineMap/AffineValueMap `map`, containing /// all pairs of the form `operands -> result` that satisfy `map`. `rel` is set /// to the relation built. For example, give the AffineMap: @@ -696,6 +254,6 @@ LogicalResult getRelationFromMap(AffineMap &map, FlatAffineRelation &rel); LogicalResult getRelationFromMap(const AffineValueMap &map, FlatAffineRelation &rel); -} // namespace mlir. +} // namespace mlir #endif // MLIR_DIALECT_AFFINE_ANALYSIS_AFFINESTRUCTURES_H diff --git a/mlir/include/mlir/IR/AffineExprVisitor.h b/mlir/include/mlir/IR/AffineExprVisitor.h index 30ee1b6e0819c..f6216614c2238 100644 --- a/mlir/include/mlir/IR/AffineExprVisitor.h +++ b/mlir/include/mlir/IR/AffineExprVisitor.h @@ -324,7 +324,7 @@ class SimpleAffineExprFlattener // A floordiv is thus flattened by introducing a new local variable q, and // replacing that expression with 'q' while adding the constraints // c * q <= expr <= c * q + c - 1 to localVarCst (done by - // FlatAffineConstraints::addLocalFloorDiv). + // IntegerRelation::addLocalFloorDiv). // // A ceildiv is similarly flattened: // t = expr ceildiv c <=> t = (expr + c - 1) floordiv c diff --git a/mlir/include/mlir/IR/IntegerSet.h b/mlir/include/mlir/IR/IntegerSet.h index b8affcae74e6e..f814776f1ee7f 100644 --- a/mlir/include/mlir/IR/IntegerSet.h +++ b/mlir/include/mlir/IR/IntegerSet.h @@ -17,7 +17,7 @@ // This class is not meant for affine analysis and operations like set // operations, emptiness checks, or other math operations for analysis and -// transformation. For the latter, use FlatAffineConstraints. +// transformation. For the latter, use FlatAffineValueConstraints. 
// //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Analysis/CMakeLists.txt b/mlir/lib/Analysis/CMakeLists.txt index 25263db944e97..b68e03c5748fc 100644 --- a/mlir/lib/Analysis/CMakeLists.txt +++ b/mlir/lib/Analysis/CMakeLists.txt @@ -2,6 +2,7 @@ set(LLVM_OPTIONAL_SOURCES AliasAnalysis.cpp CallGraph.cpp DataLayoutAnalysis.cpp + FlatLinearValueConstraints.cpp Liveness.cpp SliceAnalysis.cpp @@ -14,11 +15,14 @@ set(LLVM_OPTIONAL_SOURCES DataFlow/SparseAnalysis.cpp ) +add_subdirectory(Presburger) + add_mlir_library(MLIRAnalysis AliasAnalysis.cpp CallGraph.cpp DataFlowFramework.cpp DataLayoutAnalysis.cpp + FlatLinearValueConstraints.cpp Liveness.cpp SliceAnalysis.cpp @@ -43,8 +47,8 @@ add_mlir_library(MLIRAnalysis MLIRInferIntRangeInterface MLIRInferTypeOpInterface MLIRLoopLikeInterface + MLIRPresburger MLIRSideEffectInterfaces MLIRViewLikeInterface ) -add_subdirectory(Presburger) diff --git a/mlir/lib/Analysis/FlatLinearValueConstraints.cpp b/mlir/lib/Analysis/FlatLinearValueConstraints.cpp new file mode 100644 index 0000000000000..b89b2d11003af --- /dev/null +++ b/mlir/lib/Analysis/FlatLinearValueConstraints.cpp @@ -0,0 +1,1344 @@ +//===- FlatLinearValueConstraints.cpp - Linear Constraint -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Analysis//FlatLinearValueConstraints.h" + +#include "mlir/Analysis/Presburger/LinearTransform.h" +#include "mlir/Analysis/Presburger/Simplex.h" +#include "mlir/Analysis/Presburger/Utils.h" +#include "mlir/IR/AffineExprVisitor.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/IntegerSet.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/MathExtras.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include + +#define DEBUG_TYPE "flat-value-constraints" + +using namespace mlir; +using namespace presburger; + +//===----------------------------------------------------------------------===// +// AffineExprFlattener +//===----------------------------------------------------------------------===// + +namespace { + +// See comments for SimpleAffineExprFlattener. +// An AffineExprFlattener extends a SimpleAffineExprFlattener by recording +// constraint information associated with mod's, floordiv's, and ceildiv's +// in FlatLinearConstraints 'localVarCst'. +struct AffineExprFlattener : public SimpleAffineExprFlattener { +public: + // Constraints connecting newly introduced local variables (for mod's and + // div's) to existing (dimensional and symbolic) ones. These are always + // inequalities. + IntegerPolyhedron localVarCst; + + AffineExprFlattener(unsigned nDims, unsigned nSymbols) + : SimpleAffineExprFlattener(nDims, nSymbols), + localVarCst(PresburgerSpace::getSetSpace(nDims, nSymbols)) {} + +private: + // Add a local variable (needed to flatten a mod, floordiv, ceildiv expr). + // The local variable added is always a floordiv of a pure add/mul affine + // function of other variables, coefficients of which are specified in + // `dividend' and with respect to the positive constant `divisor'. localExpr + // is the simplified tree expression (AffineExpr) corresponding to the + // quantifier. 
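+  // For example (illustration only): for `d0 floordiv 3`, `dividend` would be
+  // the coefficient vector of d0 ({1, 0, ...}) and `divisor` would be 3; the
+  // override below then records 3*q <= d0 <= 3*q + 2 in `localVarCst` for the
+  // newly added local q.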
+  void addLocalFloorDivId(ArrayRef<int64_t> dividend, int64_t divisor,
+                          AffineExpr localExpr) override {
+    SimpleAffineExprFlattener::addLocalFloorDivId(dividend, divisor, localExpr);
+    // Update localVarCst.
+    localVarCst.addLocalFloorDiv(dividend, divisor);
+  }
+};
+
+} // namespace
+
+// Flattens the expressions in map. Returns failure if 'expr' was unable to be
+// flattened (i.e., semi-affine expressions not handled yet).
+static LogicalResult
+getFlattenedAffineExprs(ArrayRef<AffineExpr> exprs, unsigned numDims,
+                        unsigned numSymbols,
+                        std::vector<SmallVector<int64_t, 8>> *flattenedExprs,
+                        FlatLinearConstraints *localVarCst) {
+  if (exprs.empty()) {
+    if (localVarCst)
+      *localVarCst = FlatLinearConstraints(numDims, numSymbols);
+    return success();
+  }
+
+  AffineExprFlattener flattener(numDims, numSymbols);
+  // Use the same flattener to simplify each expression successively. This way
+  // local variables / expressions are shared.
+  for (auto expr : exprs) {
+    if (!expr.isPureAffine())
+      return failure();
+
+    flattener.walkPostOrder(expr);
+  }
+
+  assert(flattener.operandExprStack.size() == exprs.size());
+  flattenedExprs->clear();
+  flattenedExprs->assign(flattener.operandExprStack.begin(),
+                         flattener.operandExprStack.end());
+
+  if (localVarCst)
+    localVarCst->clearAndCopyFrom(flattener.localVarCst);
+
+  return success();
+}
+
+// Flattens 'expr' into 'flattenedExpr'. Returns failure if 'expr' was unable to
+// be flattened (semi-affine expressions not handled yet).
+LogicalResult
+mlir::getFlattenedAffineExpr(AffineExpr expr, unsigned numDims,
+                             unsigned numSymbols,
+                             SmallVectorImpl<int64_t> *flattenedExpr,
+                             FlatLinearConstraints *localVarCst) {
+  std::vector<SmallVector<int64_t, 8>> flattenedExprs;
+  LogicalResult ret = ::getFlattenedAffineExprs({expr}, numDims, numSymbols,
+                                                &flattenedExprs, localVarCst);
+  *flattenedExpr = flattenedExprs[0];
+  return ret;
+}
+
+/// Flattens the expressions in map. Returns failure if 'expr' was unable to be
+/// flattened (i.e., semi-affine expressions not handled yet).
+LogicalResult mlir::getFlattenedAffineExprs(
+    AffineMap map, std::vector<SmallVector<int64_t, 8>> *flattenedExprs,
+    FlatLinearConstraints *localVarCst) {
+  if (map.getNumResults() == 0) {
+    if (localVarCst)
+      *localVarCst =
+          FlatLinearConstraints(map.getNumDims(), map.getNumSymbols());
+    return success();
+  }
+  return ::getFlattenedAffineExprs(map.getResults(), map.getNumDims(),
+                                   map.getNumSymbols(), flattenedExprs,
+                                   localVarCst);
+}
+
+LogicalResult mlir::getFlattenedAffineExprs(
+    IntegerSet set, std::vector<SmallVector<int64_t, 8>> *flattenedExprs,
+    FlatLinearConstraints *localVarCst) {
+  if (set.getNumConstraints() == 0) {
+    if (localVarCst)
+      *localVarCst =
+          FlatLinearConstraints(set.getNumDims(), set.getNumSymbols());
+    return success();
+  }
+  return ::getFlattenedAffineExprs(set.getConstraints(), set.getNumDims(),
+                                   set.getNumSymbols(), flattenedExprs,
+                                   localVarCst);
+}
+
+//===----------------------------------------------------------------------===//
+// FlatLinearConstraints
+//===----------------------------------------------------------------------===//
+
+std::unique_ptr<FlatLinearConstraints> FlatLinearConstraints::clone() const {
+  return std::make_unique<FlatLinearConstraints>(*this);
+}
+
+// Similar to `composeMap` except that no Values need be associated with the
+// constraint system nor are they looked at -- the dimensions and symbols of
+// `other` are expected to correspond 1:1 to `this` system.
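+//
+// Minimal usage sketch (the names below are illustrative, not from this file):
+//
+//   FlatLinearConstraints cst(/*numDims=*/2, /*numSymbols=*/1);
+//   AffineMap map = ...; // e.g. (d0, d1)[s0] -> (d0 + 16 * d1 + s0)
+//   if (failed(cst.composeMatchingMap(map)))
+//     return failure(); // semi-affine maps are rejected
+//   // The map's result is now the leading dimension of `cst`.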
+LogicalResult FlatLinearConstraints::composeMatchingMap(AffineMap other) { + assert(other.getNumDims() == getNumDimVars() && "dim mismatch"); + assert(other.getNumSymbols() == getNumSymbolVars() && "symbol mismatch"); + + std::vector> flatExprs; + if (failed(flattenAlignedMapAndMergeLocals(other, &flatExprs))) + return failure(); + assert(flatExprs.size() == other.getNumResults()); + + // Add dimensions corresponding to the map's results. + insertDimVar(/*pos=*/0, /*num=*/other.getNumResults()); + + // We add one equality for each result connecting the result dim of the map to + // the other variables. + // E.g.: if the expression is 16*i0 + i1, and this is the r^th + // iteration/result of the value map, we are adding the equality: + // d_r - 16*i0 - i1 = 0. Similarly, when flattening (i0 + 1, i0 + 8*i2), we + // add two equalities: d_0 - i0 - 1 == 0, d1 - i0 - 8*i2 == 0. + for (unsigned r = 0, e = flatExprs.size(); r < e; r++) { + const auto &flatExpr = flatExprs[r]; + assert(flatExpr.size() >= other.getNumInputs() + 1); + + SmallVector eqToAdd(getNumCols(), 0); + // Set the coefficient for this result to one. + eqToAdd[r] = 1; + + // Dims and symbols. + for (unsigned i = 0, f = other.getNumInputs(); i < f; i++) { + // Negate `eq[r]` since the newly added dimension will be set to this one. + eqToAdd[e + i] = -flatExpr[i]; + } + // Local columns of `eq` are at the beginning. + unsigned j = getNumDimVars() + getNumSymbolVars(); + unsigned end = flatExpr.size() - 1; + for (unsigned i = other.getNumInputs(); i < end; i++, j++) { + eqToAdd[j] = -flatExpr[i]; + } + + // Constant term. + eqToAdd[getNumCols() - 1] = -flatExpr[flatExpr.size() - 1]; + + // Add the equality connecting the result of the map to this constraint set. + addEquality(eqToAdd); + } + + return success(); +} + +// Determine whether the variable at 'pos' (say var_r) can be expressed as +// modulo of another known variable (say var_n) w.r.t a constant. For example, +// if the following constraints hold true: +// ``` +// 0 <= var_r <= divisor - 1 +// var_n - (divisor * q_expr) = var_r +// ``` +// where `var_n` is a known variable (called dividend), and `q_expr` is an +// `AffineExpr` (called the quotient expression), `var_r` can be written as: +// +// `var_r = var_n mod divisor`. +// +// Additionally, in a special case of the above constaints where `q_expr` is an +// variable itself that is not yet known (say `var_q`), it can be written as a +// floordiv in the following way: +// +// `var_q = var_n floordiv divisor`. +// +// Returns true if the above mod or floordiv are detected, updating 'memo' with +// these new expressions. Returns false otherwise. +static bool detectAsMod(const FlatLinearConstraints &cst, unsigned pos, + int64_t lbConst, int64_t ubConst, + SmallVectorImpl &memo, + MLIRContext *context) { + assert(pos < cst.getNumVars() && "invalid position"); + + // Check if a divisor satisfying the condition `0 <= var_r <= divisor - 1` can + // be determined. + if (lbConst != 0 || ubConst < 1) + return false; + int64_t divisor = ubConst + 1; + + // Check for the aforementioned conditions in each equality. + for (unsigned curEquality = 0, numEqualities = cst.getNumEqualities(); + curEquality < numEqualities; curEquality++) { + int64_t coefficientAtPos = cst.atEq64(curEquality, pos); + // If current equality does not involve `var_r`, continue to the next + // equality. + if (coefficientAtPos == 0) + continue; + + // Constant term should be 0 in this equality. 
+ if (cst.atEq64(curEquality, cst.getNumCols() - 1) != 0) + continue; + + // Traverse through the equality and construct the dividend expression + // `dividendExpr`, to contain all the variables which are known and are + // not divisible by `(coefficientAtPos * divisor)`. Hope here is that the + // `dividendExpr` gets simplified into a single variable `var_n` discussed + // above. + auto dividendExpr = getAffineConstantExpr(0, context); + + // Track the terms that go into quotient expression, later used to detect + // additional floordiv. + unsigned quotientCount = 0; + int quotientPosition = -1; + int quotientSign = 1; + + // Consider each term in the current equality. + unsigned curVar, e; + for (curVar = 0, e = cst.getNumDimAndSymbolVars(); curVar < e; ++curVar) { + // Ignore var_r. + if (curVar == pos) + continue; + int64_t coefficientOfCurVar = cst.atEq64(curEquality, curVar); + // Ignore vars that do not contribute to the current equality. + if (coefficientOfCurVar == 0) + continue; + // Check if the current var goes into the quotient expression. + if (coefficientOfCurVar % (divisor * coefficientAtPos) == 0) { + quotientCount++; + quotientPosition = curVar; + quotientSign = (coefficientOfCurVar * coefficientAtPos) > 0 ? 1 : -1; + continue; + } + // Variables that are part of dividendExpr should be known. + if (!memo[curVar]) + break; + // Append the current variable to the dividend expression. + dividendExpr = dividendExpr + memo[curVar] * coefficientOfCurVar; + } + + // Can't construct expression as it depends on a yet uncomputed var. + if (curVar < e) + continue; + + // Express `var_r` in terms of the other vars collected so far. + if (coefficientAtPos > 0) + dividendExpr = (-dividendExpr).floorDiv(coefficientAtPos); + else + dividendExpr = dividendExpr.floorDiv(-coefficientAtPos); + + // Simplify the expression. + dividendExpr = simplifyAffineExpr(dividendExpr, cst.getNumDimVars(), + cst.getNumSymbolVars()); + // Only if the final dividend expression is just a single var (which we call + // `var_n`), we can proceed. + // TODO: Handle AffineSymbolExpr as well. There is no reason to restrict it + // to dims themselves. + auto dimExpr = dividendExpr.dyn_cast(); + if (!dimExpr) + continue; + + // Express `var_r` as `var_n % divisor` and store the expression in `memo`. + if (quotientCount >= 1) { + auto ub = cst.getConstantBound64(FlatLinearConstraints::BoundType::UB, + dimExpr.getPosition()); + // If `var_n` has an upperbound that is less than the divisor, mod can be + // eliminated altogether. + if (ub && *ub < divisor) + memo[pos] = dimExpr; + else + memo[pos] = dimExpr % divisor; + // If a unique quotient `var_q` was seen, it can be expressed as + // `var_n floordiv divisor`. + if (quotientCount == 1 && !memo[quotientPosition]) + memo[quotientPosition] = dimExpr.floorDiv(divisor) * quotientSign; + + return true; + } + } + return false; +} + +/// Check if the pos^th variable can be expressed as a floordiv of an affine +/// function of other variables (where the divisor is a positive constant) +/// given the initial set of expressions in `exprs`. If it can be, the +/// corresponding position in `exprs` is set as the detected affine expr. For +/// eg: 4q <= i + j <= 4q + 3 <=> q = (i + j) floordiv 4. An equality can +/// also yield a floordiv: eg. 4q = i + j <=> q = (i + j) floordiv 4. 32q + 28 +/// <= i <= 32q + 31 => q = i floordiv 32. 
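+///
+/// (Sketch of how this interacts with `exprs`: positions whose expression is
+/// already known serve as the available representations, so with exprs[i] = d0
+/// and exprs[j] = d1, the constraints 4q <= d0 + d1 <= 4q + 3 set exprs[q] to
+/// (d0 + d1) floordiv 4.)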
+static bool detectAsFloorDiv(const FlatLinearConstraints &cst, unsigned pos, + MLIRContext *context, + SmallVectorImpl &exprs) { + assert(pos < cst.getNumVars() && "invalid position"); + + // Get upper-lower bound pair for this variable. + SmallVector foundRepr(cst.getNumVars(), false); + for (unsigned i = 0, e = cst.getNumVars(); i < e; ++i) + if (exprs[i]) + foundRepr[i] = true; + + SmallVector dividend(cst.getNumCols()); + unsigned divisor; + auto ulPair = computeSingleVarRepr(cst, foundRepr, pos, dividend, divisor); + + // No upper-lower bound pair found for this var. + if (ulPair.kind == ReprKind::None || ulPair.kind == ReprKind::Equality) + return false; + + // Construct the dividend expression. + auto dividendExpr = getAffineConstantExpr(dividend.back(), context); + for (unsigned c = 0, f = cst.getNumVars(); c < f; c++) + if (dividend[c] != 0) + dividendExpr = dividendExpr + dividend[c] * exprs[c]; + + // Successfully detected the floordiv. + exprs[pos] = dividendExpr.floorDiv(divisor); + return true; +} + +std::pair FlatLinearConstraints::getLowerAndUpperBound( + unsigned pos, unsigned offset, unsigned num, unsigned symStartPos, + ArrayRef localExprs, MLIRContext *context, + bool closedUB) const { + assert(pos + offset < getNumDimVars() && "invalid dim start pos"); + assert(symStartPos >= (pos + offset) && "invalid sym start pos"); + assert(getNumLocalVars() == localExprs.size() && + "incorrect local exprs count"); + + SmallVector lbIndices, ubIndices, eqIndices; + getLowerAndUpperBoundIndices(pos + offset, &lbIndices, &ubIndices, &eqIndices, + offset, num); + + /// Add to 'b' from 'a' in set [0, offset) U [offset + num, symbStartPos). + auto addCoeffs = [&](ArrayRef a, SmallVectorImpl &b) { + b.clear(); + for (unsigned i = 0, e = a.size(); i < e; ++i) { + if (i < offset || i >= offset + num) + b.push_back(a[i]); + } + }; + + SmallVector lb, ub; + SmallVector lbExprs; + unsigned dimCount = symStartPos - num; + unsigned symCount = getNumDimAndSymbolVars() - symStartPos; + lbExprs.reserve(lbIndices.size() + eqIndices.size()); + // Lower bound expressions. + for (auto idx : lbIndices) { + auto ineq = getInequality64(idx); + // Extract the lower bound (in terms of other coeff's + const), i.e., if + // i - j + 1 >= 0 is the constraint, 'pos' is for i the lower bound is j + // - 1. + addCoeffs(ineq, lb); + std::transform(lb.begin(), lb.end(), lb.begin(), std::negate()); + auto expr = + getAffineExprFromFlatForm(lb, dimCount, symCount, localExprs, context); + // expr ceildiv divisor is (expr + divisor - 1) floordiv divisor + int64_t divisor = std::abs(ineq[pos + offset]); + expr = (expr + divisor - 1).floorDiv(divisor); + lbExprs.push_back(expr); + } + + SmallVector ubExprs; + ubExprs.reserve(ubIndices.size() + eqIndices.size()); + // Upper bound expressions. + for (auto idx : ubIndices) { + auto ineq = getInequality64(idx); + // Extract the upper bound (in terms of other coeff's + const). + addCoeffs(ineq, ub); + auto expr = + getAffineExprFromFlatForm(ub, dimCount, symCount, localExprs, context); + expr = expr.floorDiv(std::abs(ineq[pos + offset])); + int64_t ubAdjustment = closedUB ? 0 : 1; + ubExprs.push_back(expr + ubAdjustment); + } + + // Equalities. It's both a lower and a upper bound. + SmallVector b; + for (auto idx : eqIndices) { + auto eq = getEquality64(idx); + addCoeffs(eq, b); + if (eq[pos + offset] > 0) + std::transform(b.begin(), b.end(), b.begin(), std::negate()); + + // Extract the upper bound (in terms of other coeff's + const). 
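+    // (Illustration: from an equality such as 2*d0 - d1 == 0 with `pos` at d0,
+    // the steps below produce the exclusive upper bound d1 floordiv 2 + 1 and
+    // the lower bound d1 ceildiv 2.)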
+ auto expr = + getAffineExprFromFlatForm(b, dimCount, symCount, localExprs, context); + expr = expr.floorDiv(std::abs(eq[pos + offset])); + // Upper bound is exclusive. + ubExprs.push_back(expr + 1); + // Lower bound. + expr = + getAffineExprFromFlatForm(b, dimCount, symCount, localExprs, context); + expr = expr.ceilDiv(std::abs(eq[pos + offset])); + lbExprs.push_back(expr); + } + + auto lbMap = AffineMap::get(dimCount, symCount, lbExprs, context); + auto ubMap = AffineMap::get(dimCount, symCount, ubExprs, context); + + return {lbMap, ubMap}; +} + +/// Computes the lower and upper bounds of the first 'num' dimensional +/// variables (starting at 'offset') as affine maps of the remaining +/// variables (dimensional and symbolic variables). Local variables are +/// themselves explicitly computed as affine functions of other variables in +/// this process if needed. +void FlatLinearConstraints::getSliceBounds(unsigned offset, unsigned num, + MLIRContext *context, + SmallVectorImpl *lbMaps, + SmallVectorImpl *ubMaps, + bool closedUB) { + assert(num < getNumDimVars() && "invalid range"); + + // Basic simplification. + normalizeConstraintsByGCD(); + + LLVM_DEBUG(llvm::dbgs() << "getSliceBounds for first " << num + << " variables\n"); + LLVM_DEBUG(dump()); + + // Record computed/detected variables. + SmallVector memo(getNumVars()); + // Initialize dimensional and symbolic variables. + for (unsigned i = 0, e = getNumDimVars(); i < e; i++) { + if (i < offset) + memo[i] = getAffineDimExpr(i, context); + else if (i >= offset + num) + memo[i] = getAffineDimExpr(i - num, context); + } + for (unsigned i = getNumDimVars(), e = getNumDimAndSymbolVars(); i < e; i++) + memo[i] = getAffineSymbolExpr(i - getNumDimVars(), context); + + bool changed; + do { + changed = false; + // Identify yet unknown variables as constants or mod's / floordiv's of + // other variables if possible. + for (unsigned pos = 0; pos < getNumVars(); pos++) { + if (memo[pos]) + continue; + + auto lbConst = getConstantBound64(BoundType::LB, pos); + auto ubConst = getConstantBound64(BoundType::UB, pos); + if (lbConst.has_value() && ubConst.has_value()) { + // Detect equality to a constant. + if (*lbConst == *ubConst) { + memo[pos] = getAffineConstantExpr(*lbConst, context); + changed = true; + continue; + } + + // Detect a variable as modulo of another variable w.r.t a + // constant. + if (detectAsMod(*this, pos, *lbConst, *ubConst, memo, context)) { + changed = true; + continue; + } + } + + // Detect a variable as a floordiv of an affine function of other + // variables (divisor is a positive constant). + if (detectAsFloorDiv(*this, pos, context, memo)) { + changed = true; + continue; + } + + // Detect a variable as an expression of other variables. + unsigned idx; + if (!findConstraintWithNonZeroAt(pos, /*isEq=*/true, &idx)) { + continue; + } + + // Build AffineExpr solving for variable 'pos' in terms of all others. + auto expr = getAffineConstantExpr(0, context); + unsigned j, e; + for (j = 0, e = getNumVars(); j < e; ++j) { + if (j == pos) + continue; + int64_t c = atEq64(idx, j); + if (c == 0) + continue; + // If any of the involved IDs hasn't been found yet, we can't proceed. + if (!memo[j]) + break; + expr = expr + memo[j] * c; + } + if (j < e) + // Can't construct expression as it depends on a yet uncomputed + // variable. + continue; + + // Add constant term to AffineExpr. 
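+      // (Illustration: for the equality d0 + d1 - 4 == 0 with memo[d1] known,
+      // the steps below solve for d0 and set memo[d0] to 4 - d1.)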
+ expr = expr + atEq64(idx, getNumVars()); + int64_t vPos = atEq64(idx, pos); + assert(vPos != 0 && "expected non-zero here"); + if (vPos > 0) + expr = (-expr).floorDiv(vPos); + else + // vPos < 0. + expr = expr.floorDiv(-vPos); + // Successfully constructed expression. + memo[pos] = expr; + changed = true; + } + // This loop is guaranteed to reach a fixed point - since once an + // variable's explicit form is computed (in memo[pos]), it's not updated + // again. + } while (changed); + + int64_t ubAdjustment = closedUB ? 0 : 1; + + // Set the lower and upper bound maps for all the variables that were + // computed as affine expressions of the rest as the "detected expr" and + // "detected expr + 1" respectively; set the undetected ones to null. + std::optional tmpClone; + for (unsigned pos = 0; pos < num; pos++) { + unsigned numMapDims = getNumDimVars() - num; + unsigned numMapSymbols = getNumSymbolVars(); + AffineExpr expr = memo[pos + offset]; + if (expr) + expr = simplifyAffineExpr(expr, numMapDims, numMapSymbols); + + AffineMap &lbMap = (*lbMaps)[pos]; + AffineMap &ubMap = (*ubMaps)[pos]; + + if (expr) { + lbMap = AffineMap::get(numMapDims, numMapSymbols, expr); + ubMap = AffineMap::get(numMapDims, numMapSymbols, expr + ubAdjustment); + } else { + // TODO: Whenever there are local variables in the dependence + // constraints, we'll conservatively over-approximate, since we don't + // always explicitly compute them above (in the while loop). + if (getNumLocalVars() == 0) { + // Work on a copy so that we don't update this constraint system. + if (!tmpClone) { + tmpClone.emplace(FlatLinearConstraints(*this)); + // Removing redundant inequalities is necessary so that we don't get + // redundant loop bounds. + tmpClone->removeRedundantInequalities(); + } + std::tie(lbMap, ubMap) = tmpClone->getLowerAndUpperBound( + pos, offset, num, getNumDimVars(), /*localExprs=*/{}, context, + closedUB); + } + + // If the above fails, we'll just use the constant lower bound and the + // constant upper bound (if they exist) as the slice bounds. + // TODO: being conservative for the moment in cases that + // lead to multiple bounds - until getConstDifference in LoopFusion.cpp is + // fixed (b/126426796). + if (!lbMap || lbMap.getNumResults() > 1) { + LLVM_DEBUG(llvm::dbgs() + << "WARNING: Potentially over-approximating slice lb\n"); + auto lbConst = getConstantBound64(BoundType::LB, pos + offset); + if (lbConst.has_value()) { + lbMap = AffineMap::get(numMapDims, numMapSymbols, + getAffineConstantExpr(*lbConst, context)); + } + } + if (!ubMap || ubMap.getNumResults() > 1) { + LLVM_DEBUG(llvm::dbgs() + << "WARNING: Potentially over-approximating slice ub\n"); + auto ubConst = getConstantBound64(BoundType::UB, pos + offset); + if (ubConst.has_value()) { + ubMap = AffineMap::get( + numMapDims, numMapSymbols, + getAffineConstantExpr(*ubConst + ubAdjustment, context)); + } + } + } + LLVM_DEBUG(llvm::dbgs() + << "lb map for pos = " << Twine(pos + offset) << ", expr: "); + LLVM_DEBUG(lbMap.dump();); + LLVM_DEBUG(llvm::dbgs() + << "ub map for pos = " << Twine(pos + offset) << ", expr: "); + LLVM_DEBUG(ubMap.dump();); + } +} + +LogicalResult FlatLinearConstraints::flattenAlignedMapAndMergeLocals( + AffineMap map, std::vector> *flattenedExprs) { + FlatLinearConstraints localCst; + if (failed(getFlattenedAffineExprs(map, flattenedExprs, &localCst))) { + LLVM_DEBUG(llvm::dbgs() + << "composition unimplemented for semi-affine maps\n"); + return failure(); + } + + // Add localCst information. 
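+  // (Illustration: if `map` uses `d0 floordiv 4`, `localCst` holds one local q
+  // with 4*q <= d0 <= 4*q + 3; that local is inserted at local position 0 of
+  // this system below and the connecting constraints are appended.)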
+ if (localCst.getNumLocalVars() > 0) { + unsigned numLocalVars = getNumLocalVars(); + // Insert local dims of localCst at the beginning. + insertLocalVar(/*pos=*/0, /*num=*/localCst.getNumLocalVars()); + // Insert local dims of `this` at the end of localCst. + localCst.appendLocalVar(/*num=*/numLocalVars); + // Dimensions of localCst and this constraint set match. Append localCst to + // this constraint set. + append(localCst); + } + + return success(); +} + +LogicalResult FlatLinearConstraints::addBound(BoundType type, unsigned pos, + AffineMap boundMap, + bool isClosedBound) { + assert(boundMap.getNumDims() == getNumDimVars() && "dim mismatch"); + assert(boundMap.getNumSymbols() == getNumSymbolVars() && "symbol mismatch"); + assert(pos < getNumDimAndSymbolVars() && "invalid position"); + assert((type != BoundType::EQ || isClosedBound) && + "EQ bound must be closed."); + + // Equality follows the logic of lower bound except that we add an equality + // instead of an inequality. + assert((type != BoundType::EQ || boundMap.getNumResults() == 1) && + "single result expected"); + bool lower = type == BoundType::LB || type == BoundType::EQ; + + std::vector> flatExprs; + if (failed(flattenAlignedMapAndMergeLocals(boundMap, &flatExprs))) + return failure(); + assert(flatExprs.size() == boundMap.getNumResults()); + + // Add one (in)equality for each result. + for (const auto &flatExpr : flatExprs) { + SmallVector ineq(getNumCols(), 0); + // Dims and symbols. + for (unsigned j = 0, e = boundMap.getNumInputs(); j < e; j++) { + ineq[j] = lower ? -flatExpr[j] : flatExpr[j]; + } + // Invalid bound: pos appears in `boundMap`. + // TODO: This should be an assertion. Fix `addDomainFromSliceMaps` and/or + // its callers to prevent invalid bounds from being added. + if (ineq[pos] != 0) + continue; + ineq[pos] = lower ? 1 : -1; + // Local columns of `ineq` are at the beginning. + unsigned j = getNumDimVars() + getNumSymbolVars(); + unsigned end = flatExpr.size() - 1; + for (unsigned i = boundMap.getNumInputs(); i < end; i++, j++) { + ineq[j] = lower ? -flatExpr[i] : flatExpr[i]; + } + // Make the bound closed in if flatExpr is open. The inequality is always + // created in the upper bound form, so the adjustment is -1. + int64_t boundAdjustment = (isClosedBound || type == BoundType::EQ) ? 0 : -1; + // Constant term. + ineq[getNumCols() - 1] = (lower ? -flatExpr[flatExpr.size() - 1] + : flatExpr[flatExpr.size() - 1]) + + boundAdjustment; + type == BoundType::EQ ? addEquality(ineq) : addInequality(ineq); + } + + return success(); +} + +LogicalResult FlatLinearConstraints::addBound(BoundType type, unsigned pos, + AffineMap boundMap) { + return addBound(type, pos, boundMap, /*isClosedBound=*/type != BoundType::UB); +} + +/// Compute an explicit representation for local vars. For all systems coming +/// from MLIR integer sets, maps, or expressions where local vars were +/// introduced to model floordivs and mods, this always succeeds. +LogicalResult +FlatLinearConstraints::computeLocalVars(SmallVectorImpl &memo, + MLIRContext *context) const { + unsigned numDims = getNumDimVars(); + unsigned numSyms = getNumSymbolVars(); + + // Initialize dimensional and symbolic variables. 
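+  // Illustrative example (not from the original change): with dims (d0, d1)
+  // and symbol s0, memo is seeded with {d0, d1, s0}; a local variable that
+  // was introduced for, say, (d0 + s0) floordiv 4 is then recovered by
+  // detectAsFloorDiv in the loop below.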
+ for (unsigned i = 0; i < numDims; i++) + memo[i] = getAffineDimExpr(i, context); + for (unsigned i = numDims, e = numDims + numSyms; i < e; i++) + memo[i] = getAffineSymbolExpr(i - numDims, context); + + bool changed; + do { + // Each time `changed` is true at the end of this iteration, one or more + // local vars would have been detected as floordivs and set in memo; so the + // number of null entries in memo[...] strictly reduces; so this converges. + changed = false; + for (unsigned i = 0, e = getNumLocalVars(); i < e; ++i) + if (!memo[numDims + numSyms + i] && + detectAsFloorDiv(*this, /*pos=*/numDims + numSyms + i, context, memo)) + changed = true; + } while (changed); + + ArrayRef localExprs = + ArrayRef(memo).take_back(getNumLocalVars()); + return success( + llvm::all_of(localExprs, [](AffineExpr expr) { return expr; })); +} + +IntegerSet FlatLinearConstraints::getAsIntegerSet(MLIRContext *context) const { + if (getNumConstraints() == 0) + // Return universal set (always true): 0 == 0. + return IntegerSet::get(getNumDimVars(), getNumSymbolVars(), + getAffineConstantExpr(/*constant=*/0, context), + /*eqFlags=*/true); + + // Construct local references. + SmallVector memo(getNumVars(), AffineExpr()); + + if (failed(computeLocalVars(memo, context))) { + // Check if the local variables without an explicit representation have + // zero coefficients everywhere. + SmallVector noLocalRepVars; + unsigned numDimsSymbols = getNumDimAndSymbolVars(); + for (unsigned i = numDimsSymbols, e = getNumVars(); i < e; ++i) { + if (!memo[i] && !isColZero(/*pos=*/i)) + noLocalRepVars.push_back(i - numDimsSymbols); + } + if (!noLocalRepVars.empty()) { + LLVM_DEBUG({ + llvm::dbgs() << "local variables at position(s) "; + llvm::interleaveComma(noLocalRepVars, llvm::dbgs()); + llvm::dbgs() << " do not have an explicit representation in:\n"; + this->dump(); + }); + return IntegerSet(); + } + } + + ArrayRef localExprs = + ArrayRef(memo).take_back(getNumLocalVars()); + + // Construct the IntegerSet from the equalities/inequalities. + unsigned numDims = getNumDimVars(); + unsigned numSyms = getNumSymbolVars(); + + SmallVector eqFlags(getNumConstraints()); + std::fill(eqFlags.begin(), eqFlags.begin() + getNumEqualities(), true); + std::fill(eqFlags.begin() + getNumEqualities(), eqFlags.end(), false); + + SmallVector exprs; + exprs.reserve(getNumConstraints()); + + for (unsigned i = 0, e = getNumEqualities(); i < e; ++i) + exprs.push_back(getAffineExprFromFlatForm(getEquality64(i), numDims, + numSyms, localExprs, context)); + for (unsigned i = 0, e = getNumInequalities(); i < e; ++i) + exprs.push_back(getAffineExprFromFlatForm(getInequality64(i), numDims, + numSyms, localExprs, context)); + return IntegerSet::get(numDims, numSyms, exprs, eqFlags); +} + +//===----------------------------------------------------------------------===// +// FlatLinearValueConstraints +//===----------------------------------------------------------------------===// + +// Construct from an IntegerSet. +FlatLinearValueConstraints::FlatLinearValueConstraints(IntegerSet set, + ValueRange operands) + : FlatLinearConstraints(set.getNumInequalities(), set.getNumEqualities(), + set.getNumDims() + set.getNumSymbols() + 1, + set.getNumDims(), set.getNumSymbols(), + /*numLocals=*/0) { + // Populate values. 
+ if (operands.empty()) { + values.resize(getNumDimAndSymbolVars(), std::nullopt); + } else { + assert(set.getNumInputs() == operands.size() && "operand count mismatch"); + values.assign(operands.begin(), operands.end()); + } + + // Flatten expressions and add them to the constraint system. + std::vector> flatExprs; + FlatLinearConstraints localVarCst; + if (failed(getFlattenedAffineExprs(set, &flatExprs, &localVarCst))) { + assert(false && "flattening unimplemented for semi-affine integer sets"); + return; + } + assert(flatExprs.size() == set.getNumConstraints()); + insertVar(VarKind::Local, getNumVarKind(VarKind::Local), + /*num=*/localVarCst.getNumLocalVars()); + + for (unsigned i = 0, e = flatExprs.size(); i < e; ++i) { + const auto &flatExpr = flatExprs[i]; + assert(flatExpr.size() == getNumCols()); + if (set.getEqFlags()[i]) { + addEquality(flatExpr); + } else { + addInequality(flatExpr); + } + } + // Add the other constraints involving local vars from flattening. + append(localVarCst); +} + +// Construct a hyperrectangular constraint set from ValueRanges that represent +// induction variables, lower and upper bounds. `ivs`, `lbs` and `ubs` are +// expected to match one to one. The order of variables and constraints is: +// +// ivs | lbs | ubs | eq/ineq +// ----+-----+-----+--------- +// 1 -1 0 >= 0 +// ----+-----+-----+--------- +// -1 0 1 >= 0 +// +// All dimensions as set as VarKind::SetDim. +FlatLinearValueConstraints +FlatLinearValueConstraints::getHyperrectangular(ValueRange ivs, ValueRange lbs, + ValueRange ubs) { + FlatLinearValueConstraints res; + unsigned nIvs = ivs.size(); + assert(nIvs == lbs.size() && "expected as many lower bounds as ivs"); + assert(nIvs == ubs.size() && "expected as many upper bounds as ivs"); + + if (nIvs == 0) + return res; + + res.appendDimVar(ivs); + unsigned lbsStart = res.appendDimVar(lbs); + unsigned ubsStart = res.appendDimVar(ubs); + + MLIRContext *ctx = ivs.front().getContext(); + for (int ivIdx = 0, e = nIvs; ivIdx < e; ++ivIdx) { + // iv - lb >= 0 + AffineMap lb = AffineMap::get(/*dimCount=*/3 * nIvs, /*symbolCount=*/0, + getAffineDimExpr(lbsStart + ivIdx, ctx)); + if (failed(res.addBound(BoundType::LB, ivIdx, lb))) + llvm_unreachable("Unexpected FlatLinearValueConstraints creation error"); + // -iv + ub >= 0 + AffineMap ub = AffineMap::get(/*dimCount=*/3 * nIvs, /*symbolCount=*/0, + getAffineDimExpr(ubsStart + ivIdx, ctx)); + if (failed(res.addBound(BoundType::UB, ivIdx, ub))) + llvm_unreachable("Unexpected FlatLinearValueConstraints creation error"); + } + return res; +} + +unsigned FlatLinearValueConstraints::appendDimVar(ValueRange vals) { + unsigned pos = getNumDimVars(); + return insertVar(VarKind::SetDim, pos, vals); +} + +unsigned FlatLinearValueConstraints::appendSymbolVar(ValueRange vals) { + unsigned pos = getNumSymbolVars(); + return insertVar(VarKind::Symbol, pos, vals); +} + +unsigned FlatLinearValueConstraints::insertDimVar(unsigned pos, + ValueRange vals) { + return insertVar(VarKind::SetDim, pos, vals); +} + +unsigned FlatLinearValueConstraints::insertSymbolVar(unsigned pos, + ValueRange vals) { + return insertVar(VarKind::Symbol, pos, vals); +} + +unsigned FlatLinearValueConstraints::insertVar(VarKind kind, unsigned pos, + unsigned num) { + unsigned absolutePos = IntegerPolyhedron::insertVar(kind, pos, num); + + if (kind != VarKind::Local) { + values.insert(values.begin() + absolutePos, num, std::nullopt); + assert(values.size() == getNumDimAndSymbolVars()); + } + + return absolutePos; +} + +unsigned 
FlatLinearValueConstraints::insertVar(VarKind kind, unsigned pos, + ValueRange vals) { + assert(!vals.empty() && "expected ValueRange with Values."); + assert(kind != VarKind::Local && + "values cannot be attached to local variables."); + unsigned num = vals.size(); + unsigned absolutePos = IntegerPolyhedron::insertVar(kind, pos, num); + + // If a Value is provided, insert it; otherwise use None. + for (unsigned i = 0; i < num; ++i) + values.insert(values.begin() + absolutePos + i, + vals[i] ? std::optional(vals[i]) : std::nullopt); + + assert(values.size() == getNumDimAndSymbolVars()); + return absolutePos; +} + +bool FlatLinearValueConstraints::hasValues() const { + return llvm::any_of( + values, [](const std::optional &var) { return var.has_value(); }); +} + +/// Checks if two constraint systems are in the same space, i.e., if they are +/// associated with the same set of variables, appearing in the same order. +static bool areVarsAligned(const FlatLinearValueConstraints &a, + const FlatLinearValueConstraints &b) { + return a.getNumDimVars() == b.getNumDimVars() && + a.getNumSymbolVars() == b.getNumSymbolVars() && + a.getNumVars() == b.getNumVars() && + a.getMaybeValues().equals(b.getMaybeValues()); +} + +/// Calls areVarsAligned to check if two constraint systems have the same set +/// of variables in the same order. +bool FlatLinearValueConstraints::areVarsAlignedWithOther( + const FlatLinearConstraints &other) { + return areVarsAligned(*this, other); +} + +/// Checks if the SSA values associated with `cst`'s variables in range +/// [start, end) are unique. +static bool LLVM_ATTRIBUTE_UNUSED areVarsUnique( + const FlatLinearValueConstraints &cst, unsigned start, unsigned end) { + + assert(start <= cst.getNumDimAndSymbolVars() && + "Start position out of bounds"); + assert(end <= cst.getNumDimAndSymbolVars() && "End position out of bounds"); + + if (start >= end) + return true; + + SmallPtrSet uniqueVars; + ArrayRef> maybeValues = + cst.getMaybeValues().slice(start, end - start); + for (std::optional val : maybeValues) { + if (val && !uniqueVars.insert(*val).second) + return false; + } + return true; +} + +/// Checks if the SSA values associated with `cst`'s variables are unique. +static bool LLVM_ATTRIBUTE_UNUSED +areVarsUnique(const FlatLinearValueConstraints &cst) { + return areVarsUnique(cst, 0, cst.getNumDimAndSymbolVars()); +} + +/// Checks if the SSA values associated with `cst`'s variables of kind `kind` +/// are unique. +static bool LLVM_ATTRIBUTE_UNUSED +areVarsUnique(const FlatLinearValueConstraints &cst, VarKind kind) { + + if (kind == VarKind::SetDim) + return areVarsUnique(cst, 0, cst.getNumDimVars()); + if (kind == VarKind::Symbol) + return areVarsUnique(cst, cst.getNumDimVars(), + cst.getNumDimAndSymbolVars()); + llvm_unreachable("Unexpected VarKind"); +} + +/// Merge and align the variables of A and B starting at 'offset', so that +/// both constraint systems get the union of the contained variables that is +/// dimension-wise and symbol-wise unique; both constraint systems are updated +/// so that they have the union of all variables, with A's original +/// variables appearing first followed by any of B's variables that didn't +/// appear in A. Local variables in B that have the same division +/// representation as local variables in A are merged into one. 
+// E.g.: Input: A has ((%i, %j) [%M, %N]) and B has ((%k, %j) [%P, %N, %M])
+// Output: both A, B have (%i, %j, %k) [%M, %N, %P]
+static void mergeAndAlignVars(unsigned offset, FlatLinearValueConstraints *a,
+                              FlatLinearValueConstraints *b) {
+  assert(offset <= a->getNumDimVars() && offset <= b->getNumDimVars());
+  // A merge/align isn't meaningful if a cst's vars aren't distinct.
+  assert(areVarsUnique(*a) && "A's values aren't unique");
+  assert(areVarsUnique(*b) && "B's values aren't unique");
+
+  assert(llvm::all_of(
+      llvm::drop_begin(a->getMaybeValues(), offset),
+      [](const std::optional &var) { return var.has_value(); }));
+
+  assert(llvm::all_of(
+      llvm::drop_begin(b->getMaybeValues(), offset),
+      [](const std::optional &var) { return var.has_value(); }));
+
+  SmallVector aDimValues;
+  a->getValues(offset, a->getNumDimVars(), &aDimValues);
+
+  {
+    // Merge dims from A into B.
+    unsigned d = offset;
+    for (auto aDimValue : aDimValues) {
+      unsigned loc;
+      if (b->findVar(aDimValue, &loc)) {
+        assert(loc >= offset && "A's dim appears in B's aligned range");
+        assert(loc < b->getNumDimVars() &&
+               "A's dim appears in B's non-dim position");
+        b->swapVar(d, loc);
+      } else {
+        b->insertDimVar(d, aDimValue);
+      }
+      d++;
+    }
+    // Dimensions that are in B, but not in A, are added at the end.
+    for (unsigned t = a->getNumDimVars(), e = b->getNumDimVars(); t < e; t++) {
+      a->appendDimVar(b->getValue(t));
+    }
+    assert(a->getNumDimVars() == b->getNumDimVars() &&
+           "expected same number of dims");
+  }
+
+  // Merge and align symbols of A and B
+  a->mergeSymbolVars(*b);
+  // Merge and align locals of A and B
+  a->mergeLocalVars(*b);
+
+  assert(areVarsAligned(*a, *b) && "IDs expected to be aligned");
+}
+
+// Call 'mergeAndAlignVars' to align constraint systems of 'this' and 'other'.
+void FlatLinearValueConstraints::mergeAndAlignVarsWithOther(
+    unsigned offset, FlatLinearValueConstraints *other) {
+  mergeAndAlignVars(offset, this, other);
+}
+
+/// Merge and align symbols of `this` and `other` such that both get the union
+/// of symbols that are unique. Symbols in `this` and `other` should be
+/// unique. Symbols with Value as `None` are considered to be unequal to all
+/// other symbols.
+void FlatLinearValueConstraints::mergeSymbolVars(
+    FlatLinearValueConstraints &other) {
+
+  assert(areVarsUnique(*this, VarKind::Symbol) && "Symbol vars are not unique");
+  assert(areVarsUnique(other, VarKind::Symbol) && "Symbol vars are not unique");
+
+  SmallVector aSymValues;
+  getValues(getNumDimVars(), getNumDimAndSymbolVars(), &aSymValues);
+
+  // Merge symbols: merge symbols into `other` first from `this`.
+  unsigned s = other.getNumDimVars();
+  for (Value aSymValue : aSymValues) {
+    unsigned loc;
+    // If the var is a symbol in `other`, then align it, otherwise assume that
+    // it is a new symbol.
+    if (other.findVar(aSymValue, &loc) && loc >= other.getNumDimVars() &&
+        loc < other.getNumDimAndSymbolVars())
+      other.swapVar(s, loc);
+    else
+      other.insertSymbolVar(s - other.getNumDimVars(), aSymValue);
+    s++;
+  }
+
+  // Symbols that are in other, but not in this, are added at the end.
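+  // Illustrative example (not from the original change): if `this` has
+  // symbols [%M, %N] and `other` has [%N, %P], both constraint systems end up
+  // with the aligned symbol list [%M, %N, %P].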
+ for (unsigned t = other.getNumDimVars() + getNumSymbolVars(), + e = other.getNumDimAndSymbolVars(); + t < e; t++) + insertSymbolVar(getNumSymbolVars(), other.getValue(t)); + + assert(getNumSymbolVars() == other.getNumSymbolVars() && + "expected same number of symbols"); + assert(areVarsUnique(*this, VarKind::Symbol) && "Symbol vars are not unique"); + assert(areVarsUnique(other, VarKind::Symbol) && "Symbol vars are not unique"); +} + +bool FlatLinearValueConstraints::hasConsistentState() const { + return IntegerPolyhedron::hasConsistentState() && + values.size() == getNumDimAndSymbolVars(); +} + +void FlatLinearValueConstraints::removeVarRange(VarKind kind, unsigned varStart, + unsigned varLimit) { + IntegerPolyhedron::removeVarRange(kind, varStart, varLimit); + unsigned offset = getVarKindOffset(kind); + + if (kind != VarKind::Local) { + values.erase(values.begin() + varStart + offset, + values.begin() + varLimit + offset); + } +} + +AffineMap +FlatLinearValueConstraints::computeAlignedMap(AffineMap map, + ValueRange operands) const { + assert(map.getNumInputs() == operands.size() && "number of inputs mismatch"); + + SmallVector dims, syms; +#ifndef NDEBUG + SmallVector newSyms; + SmallVector *newSymsPtr = &newSyms; +#else + SmallVector *newSymsPtr = nullptr; +#endif // NDEBUG + + dims.reserve(getNumDimVars()); + syms.reserve(getNumSymbolVars()); + for (unsigned i = getVarKindOffset(VarKind::SetDim), + e = getVarKindEnd(VarKind::SetDim); + i < e; ++i) + dims.push_back(values[i] ? *values[i] : Value()); + for (unsigned i = getVarKindOffset(VarKind::Symbol), + e = getVarKindEnd(VarKind::Symbol); + i < e; ++i) + syms.push_back(values[i] ? *values[i] : Value()); + + AffineMap alignedMap = + alignAffineMapWithValues(map, operands, dims, syms, newSymsPtr); + // All symbols are already part of this FlatAffineValueConstraints. + assert(syms.size() == newSymsPtr->size() && "unexpected new/missing symbols"); + assert(std::equal(syms.begin(), syms.end(), newSymsPtr->begin()) && + "unexpected new/missing symbols"); + return alignedMap; +} + +bool FlatLinearValueConstraints::findVar(Value val, unsigned *pos) const { + unsigned i = 0; + for (const auto &mayBeVar : values) { + if (mayBeVar && *mayBeVar == val) { + *pos = i; + return true; + } + i++; + } + return false; +} + +bool FlatLinearValueConstraints::containsVar(Value val) const { + return llvm::any_of(values, [&](const std::optional &mayBeVar) { + return mayBeVar && *mayBeVar == val; + }); +} + +void FlatLinearValueConstraints::swapVar(unsigned posA, unsigned posB) { + IntegerPolyhedron::swapVar(posA, posB); + + if (getVarKindAt(posA) == VarKind::Local && + getVarKindAt(posB) == VarKind::Local) + return; + + // Treat value of a local variable as std::nullopt. + if (getVarKindAt(posA) == VarKind::Local) + values[posB] = std::nullopt; + else if (getVarKindAt(posB) == VarKind::Local) + values[posA] = std::nullopt; + else + std::swap(values[posA], values[posB]); +} + +void FlatLinearValueConstraints::addBound(BoundType type, Value val, + int64_t value) { + unsigned pos; + if (!findVar(val, &pos)) + // This is a pre-condition for this method. 
+ assert(0 && "var not found"); + addBound(type, pos, value); +} + +void FlatLinearConstraints::printSpace(raw_ostream &os) const { + IntegerPolyhedron::printSpace(os); + os << "("; + for (unsigned i = 0, e = getNumDimAndSymbolVars(); i < e; i++) + os << "None\t"; + for (unsigned i = getVarKindOffset(VarKind::Local), + e = getVarKindEnd(VarKind::Local); + i < e; ++i) + os << "Local\t"; + os << "const)\n"; +} + +void FlatLinearValueConstraints::printSpace(raw_ostream &os) const { + IntegerPolyhedron::printSpace(os); + os << "("; + for (unsigned i = 0, e = getNumDimAndSymbolVars(); i < e; i++) { + if (hasValue(i)) + os << "Value\t"; + else + os << "None\t"; + } + for (unsigned i = getVarKindOffset(VarKind::Local), + e = getVarKindEnd(VarKind::Local); + i < e; ++i) + os << "Local\t"; + os << "const)\n"; +} + +void FlatLinearValueConstraints::clearAndCopyFrom( + const IntegerRelation &other) { + + if (auto *otherValueSet = + dyn_cast(&other)) { + *this = *otherValueSet; + } else { + *static_cast(this) = other; + values.clear(); + values.resize(getNumDimAndSymbolVars(), std::nullopt); + } +} + +void FlatLinearValueConstraints::fourierMotzkinEliminate( + unsigned pos, bool darkShadow, bool *isResultIntegerExact) { + SmallVector, 8> newVals = values; + if (getVarKindAt(pos) != VarKind::Local) + newVals.erase(newVals.begin() + pos); + // Note: Base implementation discards all associated Values. + IntegerPolyhedron::fourierMotzkinEliminate(pos, darkShadow, + isResultIntegerExact); + values = newVals; + assert(values.size() == getNumDimAndSymbolVars()); +} + +void FlatLinearValueConstraints::projectOut(Value val) { + unsigned pos; + bool ret = findVar(val, &pos); + assert(ret); + (void)ret; + fourierMotzkinEliminate(pos); +} + +LogicalResult FlatLinearValueConstraints::unionBoundingBox( + const FlatLinearValueConstraints &otherCst) { + assert(otherCst.getNumDimVars() == getNumDimVars() && "dims mismatch"); + assert(otherCst.getMaybeValues() + .slice(0, getNumDimVars()) + .equals(getMaybeValues().slice(0, getNumDimVars())) && + "dim values mismatch"); + assert(otherCst.getNumLocalVars() == 0 && "local vars not supported here"); + assert(getNumLocalVars() == 0 && "local vars not supported yet here"); + + // Align `other` to this. + if (!areVarsAligned(*this, otherCst)) { + FlatLinearValueConstraints otherCopy(otherCst); + mergeAndAlignVars(/*offset=*/getNumDimVars(), this, &otherCopy); + return IntegerPolyhedron::unionBoundingBox(otherCopy); + } + + return IntegerPolyhedron::unionBoundingBox(otherCst); +} + +//===----------------------------------------------------------------------===// +// Helper functions +//===----------------------------------------------------------------------===// + +AffineMap mlir::alignAffineMapWithValues(AffineMap map, ValueRange operands, + ValueRange dims, ValueRange syms, + SmallVector *newSyms) { + assert(operands.size() == map.getNumInputs() && + "expected same number of operands and map inputs"); + MLIRContext *ctx = map.getContext(); + Builder builder(ctx); + SmallVector dimReplacements(map.getNumDims(), {}); + unsigned numSymbols = syms.size(); + SmallVector symReplacements(map.getNumSymbols(), {}); + if (newSyms) { + newSyms->clear(); + newSyms->append(syms.begin(), syms.end()); + } + + for (const auto &operand : llvm::enumerate(operands)) { + // Compute replacement dim/sym of operand. 
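+    // Illustrative example (not from the original change): for a map
+    // (d0)[s0] -> (d0 + s0) with operands (%a, %b), dims (%c, %a) and no
+    // syms, %a is matched to dim 1 and %b becomes a new trailing symbol, so
+    // the result is (d0, d1)[s0] -> (d1 + s0) with newSyms = [%b].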
+ AffineExpr replacement; + auto dimIt = std::find(dims.begin(), dims.end(), operand.value()); + auto symIt = std::find(syms.begin(), syms.end(), operand.value()); + if (dimIt != dims.end()) { + replacement = + builder.getAffineDimExpr(std::distance(dims.begin(), dimIt)); + } else if (symIt != syms.end()) { + replacement = + builder.getAffineSymbolExpr(std::distance(syms.begin(), symIt)); + } else { + // This operand is neither a dimension nor a symbol. Add it as a new + // symbol. + replacement = builder.getAffineSymbolExpr(numSymbols++); + if (newSyms) + newSyms->push_back(operand.value()); + } + // Add to corresponding replacements vector. + if (operand.index() < map.getNumDims()) { + dimReplacements[operand.index()] = replacement; + } else { + symReplacements[operand.index() - map.getNumDims()] = replacement; + } + } + + return map.replaceDimsAndSymbols(dimReplacements, symReplacements, + dims.size(), numSymbols); +} + +LogicalResult +mlir::getMultiAffineFunctionFromMap(AffineMap map, + MultiAffineFunction &multiAff) { + FlatLinearConstraints cst; + std::vector> flattenedExprs; + LogicalResult result = getFlattenedAffineExprs(map, &flattenedExprs, &cst); + + if (result.failed()) + return failure(); + + DivisionRepr divs = cst.getLocalReprs(); + assert(divs.hasAllReprs() && + "AffineMap cannot produce divs without local representation"); + + // TODO: We shouldn't have to do this conversion. + Matrix mat(map.getNumResults(), map.getNumInputs() + divs.getNumDivs() + 1); + for (unsigned i = 0, e = flattenedExprs.size(); i < e; ++i) + for (unsigned j = 0, f = flattenedExprs[i].size(); j < f; ++j) + mat(i, j) = flattenedExprs[i][j]; + + multiAff = MultiAffineFunction( + PresburgerSpace::getRelationSpace(map.getNumDims(), map.getNumResults(), + map.getNumSymbols(), divs.getNumDivs()), + mat, divs); + + return success(); +} diff --git a/mlir/lib/Dialect/Affine/Analysis/AffineStructures.cpp b/mlir/lib/Dialect/Affine/Analysis/AffineStructures.cpp index 03b8b1d72a5fa..f087dca20f34c 100644 --- a/mlir/lib/Dialect/Affine/Analysis/AffineStructures.cpp +++ b/mlir/lib/Dialect/Affine/Analysis/AffineStructures.cpp @@ -33,504 +33,6 @@ using namespace mlir; using namespace presburger; -namespace { - -// See comments for SimpleAffineExprFlattener. -// An AffineExprFlattener extends a SimpleAffineExprFlattener by recording -// constraint information associated with mod's, floordiv's, and ceildiv's -// in FlatAffineValueConstraints 'localVarCst'. -struct AffineExprFlattener : public SimpleAffineExprFlattener { -public: - // Constraints connecting newly introduced local variables (for mod's and - // div's) to existing (dimensional and symbolic) ones. These are always - // inequalities. - IntegerPolyhedron localVarCst; - - AffineExprFlattener(unsigned nDims, unsigned nSymbols) - : SimpleAffineExprFlattener(nDims, nSymbols), - localVarCst(PresburgerSpace::getSetSpace(nDims, nSymbols)) {} - -private: - // Add a local variable (needed to flatten a mod, floordiv, ceildiv expr). - // The local variable added is always a floordiv of a pure add/mul affine - // function of other variables, coefficients of which are specified in - // `dividend' and with respect to the positive constant `divisor'. localExpr - // is the simplified tree expression (AffineExpr) corresponding to the - // quantifier. - void addLocalFloorDivId(ArrayRef dividend, int64_t divisor, - AffineExpr localExpr) override { - SimpleAffineExprFlattener::addLocalFloorDivId(dividend, divisor, localExpr); - // Update localVarCst. 
- localVarCst.addLocalFloorDiv(dividend, divisor); - } -}; - -} // namespace - -// Flattens the expressions in map. Returns failure if 'expr' was unable to be -// flattened (i.e., semi-affine expressions not handled yet). -static LogicalResult -getFlattenedAffineExprs(ArrayRef exprs, unsigned numDims, - unsigned numSymbols, - std::vector> *flattenedExprs, - FlatAffineValueConstraints *localVarCst) { - if (exprs.empty()) { - if (localVarCst) - *localVarCst = FlatAffineValueConstraints(numDims, numSymbols); - return success(); - } - - AffineExprFlattener flattener(numDims, numSymbols); - // Use the same flattener to simplify each expression successively. This way - // local variables / expressions are shared. - for (auto expr : exprs) { - if (!expr.isPureAffine()) - return failure(); - - flattener.walkPostOrder(expr); - } - - assert(flattener.operandExprStack.size() == exprs.size()); - flattenedExprs->clear(); - flattenedExprs->assign(flattener.operandExprStack.begin(), - flattener.operandExprStack.end()); - - if (localVarCst) - localVarCst->clearAndCopyFrom(flattener.localVarCst); - - return success(); -} - -// Flattens 'expr' into 'flattenedExpr'. Returns failure if 'expr' was unable to -// be flattened (semi-affine expressions not handled yet). -LogicalResult -mlir::getFlattenedAffineExpr(AffineExpr expr, unsigned numDims, - unsigned numSymbols, - SmallVectorImpl *flattenedExpr, - FlatAffineValueConstraints *localVarCst) { - std::vector> flattenedExprs; - LogicalResult ret = ::getFlattenedAffineExprs({expr}, numDims, numSymbols, - &flattenedExprs, localVarCst); - *flattenedExpr = flattenedExprs[0]; - return ret; -} - -/// Flattens the expressions in map. Returns failure if 'expr' was unable to be -/// flattened (i.e., semi-affine expressions not handled yet). -LogicalResult mlir::getFlattenedAffineExprs( - AffineMap map, std::vector> *flattenedExprs, - FlatAffineValueConstraints *localVarCst) { - if (map.getNumResults() == 0) { - if (localVarCst) - *localVarCst = - FlatAffineValueConstraints(map.getNumDims(), map.getNumSymbols()); - return success(); - } - return ::getFlattenedAffineExprs(map.getResults(), map.getNumDims(), - map.getNumSymbols(), flattenedExprs, - localVarCst); -} - -LogicalResult mlir::getFlattenedAffineExprs( - IntegerSet set, std::vector> *flattenedExprs, - FlatAffineValueConstraints *localVarCst) { - if (set.getNumConstraints() == 0) { - if (localVarCst) - *localVarCst = - FlatAffineValueConstraints(set.getNumDims(), set.getNumSymbols()); - return success(); - } - return ::getFlattenedAffineExprs(set.getConstraints(), set.getNumDims(), - set.getNumSymbols(), flattenedExprs, - localVarCst); -} - -//===----------------------------------------------------------------------===// -// FlatAffineConstraints / FlatAffineValueConstraints. -//===----------------------------------------------------------------------===// - -std::unique_ptr -FlatAffineValueConstraints::clone() const { - return std::make_unique(*this); -} - -// Construct from an IntegerSet. -FlatAffineValueConstraints::FlatAffineValueConstraints(IntegerSet set, - ValueRange operands) - : IntegerPolyhedron(set.getNumInequalities(), set.getNumEqualities(), - set.getNumDims() + set.getNumSymbols() + 1, - PresburgerSpace::getSetSpace(set.getNumDims(), - set.getNumSymbols(), - /*numLocals=*/0)) { - // Populate values. 
- if (operands.empty()) { - values.resize(getNumDimAndSymbolVars(), std::nullopt); - } else { - assert(set.getNumInputs() == operands.size() && "operand count mismatch"); - values.assign(operands.begin(), operands.end()); - } - - // Flatten expressions and add them to the constraint system. - std::vector> flatExprs; - FlatAffineValueConstraints localVarCst; - if (failed(getFlattenedAffineExprs(set, &flatExprs, &localVarCst))) { - assert(false && "flattening unimplemented for semi-affine integer sets"); - return; - } - assert(flatExprs.size() == set.getNumConstraints()); - insertVar(VarKind::Local, getNumVarKind(VarKind::Local), - /*num=*/localVarCst.getNumLocalVars()); - - for (unsigned i = 0, e = flatExprs.size(); i < e; ++i) { - const auto &flatExpr = flatExprs[i]; - assert(flatExpr.size() == getNumCols()); - if (set.getEqFlags()[i]) { - addEquality(flatExpr); - } else { - addInequality(flatExpr); - } - } - // Add the other constraints involving local vars from flattening. - append(localVarCst); -} - -// Construct a hyperrectangular constraint set from ValueRanges that represent -// induction variables, lower and upper bounds. `ivs`, `lbs` and `ubs` are -// expected to match one to one. The order of variables and constraints is: -// -// ivs | lbs | ubs | eq/ineq -// ----+-----+-----+--------- -// 1 -1 0 >= 0 -// ----+-----+-----+--------- -// -1 0 1 >= 0 -// -// All dimensions as set as VarKind::SetDim. -FlatAffineValueConstraints -FlatAffineValueConstraints::getHyperrectangular(ValueRange ivs, ValueRange lbs, - ValueRange ubs) { - FlatAffineValueConstraints res; - unsigned nIvs = ivs.size(); - assert(nIvs == lbs.size() && "expected as many lower bounds as ivs"); - assert(nIvs == ubs.size() && "expected as many upper bounds as ivs"); - - if (nIvs == 0) - return res; - - res.appendDimVar(ivs); - unsigned lbsStart = res.appendDimVar(lbs); - unsigned ubsStart = res.appendDimVar(ubs); - - MLIRContext *ctx = ivs.front().getContext(); - for (int ivIdx = 0, e = nIvs; ivIdx < e; ++ivIdx) { - // iv - lb >= 0 - AffineMap lb = AffineMap::get(/*dimCount=*/3 * nIvs, /*symbolCount=*/0, - getAffineDimExpr(lbsStart + ivIdx, ctx)); - if (failed(res.addBound(BoundType::LB, ivIdx, lb))) - llvm_unreachable("Unexpected FlatAffineValueConstraints creation error"); - // -iv + ub >= 0 - AffineMap ub = AffineMap::get(/*dimCount=*/3 * nIvs, /*symbolCount=*/0, - getAffineDimExpr(ubsStart + ivIdx, ctx)); - if (failed(res.addBound(BoundType::UB, ivIdx, ub))) - llvm_unreachable("Unexpected FlatAffineValueConstraints creation error"); - } - return res; -} - -unsigned FlatAffineValueConstraints::appendDimVar(ValueRange vals) { - unsigned pos = getNumDimVars(); - return insertVar(VarKind::SetDim, pos, vals); -} - -unsigned FlatAffineValueConstraints::appendSymbolVar(ValueRange vals) { - unsigned pos = getNumSymbolVars(); - return insertVar(VarKind::Symbol, pos, vals); -} - -unsigned FlatAffineValueConstraints::insertDimVar(unsigned pos, - ValueRange vals) { - return insertVar(VarKind::SetDim, pos, vals); -} - -unsigned FlatAffineValueConstraints::insertSymbolVar(unsigned pos, - ValueRange vals) { - return insertVar(VarKind::Symbol, pos, vals); -} - -unsigned FlatAffineValueConstraints::insertVar(VarKind kind, unsigned pos, - unsigned num) { - unsigned absolutePos = IntegerPolyhedron::insertVar(kind, pos, num); - - if (kind != VarKind::Local) { - values.insert(values.begin() + absolutePos, num, std::nullopt); - assert(values.size() == getNumDimAndSymbolVars()); - } - - return absolutePos; -} - -unsigned 
FlatAffineValueConstraints::insertVar(VarKind kind, unsigned pos, - ValueRange vals) { - assert(!vals.empty() && "expected ValueRange with Values."); - assert(kind != VarKind::Local && - "values cannot be attached to local variables."); - unsigned num = vals.size(); - unsigned absolutePos = IntegerPolyhedron::insertVar(kind, pos, num); - - // If a Value is provided, insert it; otherwise use None. - for (unsigned i = 0; i < num; ++i) - values.insert(values.begin() + absolutePos + i, - vals[i] ? std::optional(vals[i]) : std::nullopt); - - assert(values.size() == getNumDimAndSymbolVars()); - return absolutePos; -} - -bool FlatAffineValueConstraints::hasValues() const { - return llvm::any_of( - values, [](const std::optional &var) { return var.has_value(); }); -} - -/// Checks if two constraint systems are in the same space, i.e., if they are -/// associated with the same set of variables, appearing in the same order. -static bool areVarsAligned(const FlatAffineValueConstraints &a, - const FlatAffineValueConstraints &b) { - return a.getNumDimVars() == b.getNumDimVars() && - a.getNumSymbolVars() == b.getNumSymbolVars() && - a.getNumVars() == b.getNumVars() && - a.getMaybeValues().equals(b.getMaybeValues()); -} - -/// Calls areVarsAligned to check if two constraint systems have the same set -/// of variables in the same order. -bool FlatAffineValueConstraints::areVarsAlignedWithOther( - const FlatAffineValueConstraints &other) { - return areVarsAligned(*this, other); -} - -/// Checks if the SSA values associated with `cst`'s variables in range -/// [start, end) are unique. -static bool LLVM_ATTRIBUTE_UNUSED areVarsUnique( - const FlatAffineValueConstraints &cst, unsigned start, unsigned end) { - - assert(start <= cst.getNumDimAndSymbolVars() && - "Start position out of bounds"); - assert(end <= cst.getNumDimAndSymbolVars() && "End position out of bounds"); - - if (start >= end) - return true; - - SmallPtrSet uniqueVars; - ArrayRef> maybeValues = - cst.getMaybeValues().slice(start, end - start); - for (std::optional val : maybeValues) { - if (val && !uniqueVars.insert(*val).second) - return false; - } - return true; -} - -/// Checks if the SSA values associated with `cst`'s variables are unique. -static bool LLVM_ATTRIBUTE_UNUSED -areVarsUnique(const FlatAffineValueConstraints &cst) { - return areVarsUnique(cst, 0, cst.getNumDimAndSymbolVars()); -} - -/// Checks if the SSA values associated with `cst`'s variables of kind `kind` -/// are unique. -static bool LLVM_ATTRIBUTE_UNUSED -areVarsUnique(const FlatAffineValueConstraints &cst, VarKind kind) { - - if (kind == VarKind::SetDim) - return areVarsUnique(cst, 0, cst.getNumDimVars()); - if (kind == VarKind::Symbol) - return areVarsUnique(cst, cst.getNumDimVars(), - cst.getNumDimAndSymbolVars()); - llvm_unreachable("Unexpected VarKind"); -} - -/// Merge and align the variables of A and B starting at 'offset', so that -/// both constraint systems get the union of the contained variables that is -/// dimension-wise and symbol-wise unique; both constraint systems are updated -/// so that they have the union of all variables, with A's original -/// variables appearing first followed by any of B's variables that didn't -/// appear in A. Local variables in B that have the same division -/// representation as local variables in A are merged into one. 
-// E.g.: Input: A has ((%i, %j) [%M, %N]) and B has (%k, %j) [%P, %N, %M]) -// Output: both A, B have (%i, %j, %k) [%M, %N, %P] -static void mergeAndAlignVars(unsigned offset, FlatAffineValueConstraints *a, - FlatAffineValueConstraints *b) { - assert(offset <= a->getNumDimVars() && offset <= b->getNumDimVars()); - // A merge/align isn't meaningful if a cst's vars aren't distinct. - assert(areVarsUnique(*a) && "A's values aren't unique"); - assert(areVarsUnique(*b) && "B's values aren't unique"); - - assert(llvm::all_of( - llvm::drop_begin(a->getMaybeValues(), offset), - [](const std::optional &var) { return var.has_value(); })); - - assert(llvm::all_of( - llvm::drop_begin(b->getMaybeValues(), offset), - [](const std::optional &var) { return var.has_value(); })); - - SmallVector aDimValues; - a->getValues(offset, a->getNumDimVars(), &aDimValues); - - { - // Merge dims from A into B. - unsigned d = offset; - for (auto aDimValue : aDimValues) { - unsigned loc; - if (b->findVar(aDimValue, &loc)) { - assert(loc >= offset && "A's dim appears in B's aligned range"); - assert(loc < b->getNumDimVars() && - "A's dim appears in B's non-dim position"); - b->swapVar(d, loc); - } else { - b->insertDimVar(d, aDimValue); - } - d++; - } - // Dimensions that are in B, but not in A, are added at the end. - for (unsigned t = a->getNumDimVars(), e = b->getNumDimVars(); t < e; t++) { - a->appendDimVar(b->getValue(t)); - } - assert(a->getNumDimVars() == b->getNumDimVars() && - "expected same number of dims"); - } - - // Merge and align symbols of A and B - a->mergeSymbolVars(*b); - // Merge and align locals of A and B - a->mergeLocalVars(*b); - - assert(areVarsAligned(*a, *b) && "IDs expected to be aligned"); -} - -// Call 'mergeAndAlignVars' to align constraint systems of 'this' and 'other'. -void FlatAffineValueConstraints::mergeAndAlignVarsWithOther( - unsigned offset, FlatAffineValueConstraints *other) { - mergeAndAlignVars(offset, this, other); -} - -LogicalResult -FlatAffineValueConstraints::composeMap(const AffineValueMap *vMap) { - return composeMatchingMap( - computeAlignedMap(vMap->getAffineMap(), vMap->getOperands())); -} - -// Similar to `composeMap` except that no Values need be associated with the -// constraint system nor are they looked at -- the dimensions and symbols of -// `other` are expected to correspond 1:1 to `this` system. -LogicalResult FlatAffineValueConstraints::composeMatchingMap(AffineMap other) { - assert(other.getNumDims() == getNumDimVars() && "dim mismatch"); - assert(other.getNumSymbols() == getNumSymbolVars() && "symbol mismatch"); - - std::vector> flatExprs; - if (failed(flattenAlignedMapAndMergeLocals(other, &flatExprs))) - return failure(); - assert(flatExprs.size() == other.getNumResults()); - - // Add dimensions corresponding to the map's results. - insertDimVar(/*pos=*/0, /*num=*/other.getNumResults()); - - // We add one equality for each result connecting the result dim of the map to - // the other variables. - // E.g.: if the expression is 16*i0 + i1, and this is the r^th - // iteration/result of the value map, we are adding the equality: - // d_r - 16*i0 - i1 = 0. Similarly, when flattening (i0 + 1, i0 + 8*i2), we - // add two equalities: d_0 - i0 - 1 == 0, d1 - i0 - 8*i2 == 0. - for (unsigned r = 0, e = flatExprs.size(); r < e; r++) { - const auto &flatExpr = flatExprs[r]; - assert(flatExpr.size() >= other.getNumInputs() + 1); - - SmallVector eqToAdd(getNumCols(), 0); - // Set the coefficient for this result to one. - eqToAdd[r] = 1; - - // Dims and symbols. 
- for (unsigned i = 0, f = other.getNumInputs(); i < f; i++) { - // Negate `eq[r]` since the newly added dimension will be set to this one. - eqToAdd[e + i] = -flatExpr[i]; - } - // Local columns of `eq` are at the beginning. - unsigned j = getNumDimVars() + getNumSymbolVars(); - unsigned end = flatExpr.size() - 1; - for (unsigned i = other.getNumInputs(); i < end; i++, j++) { - eqToAdd[j] = -flatExpr[i]; - } - - // Constant term. - eqToAdd[getNumCols() - 1] = -flatExpr[flatExpr.size() - 1]; - - // Add the equality connecting the result of the map to this constraint set. - addEquality(eqToAdd); - } - - return success(); -} - -// Turn a symbol into a dimension. -static void turnSymbolIntoDim(FlatAffineValueConstraints *cst, Value value) { - unsigned pos; - if (cst->findVar(value, &pos) && pos >= cst->getNumDimVars() && - pos < cst->getNumDimAndSymbolVars()) { - cst->swapVar(pos, cst->getNumDimVars()); - cst->setDimSymbolSeparation(cst->getNumSymbolVars() - 1); - } -} - -/// Merge and align symbols of `this` and `other` such that both get union of -/// of symbols that are unique. Symbols in `this` and `other` should be -/// unique. Symbols with Value as `None` are considered to be inequal to all -/// other symbols. -void FlatAffineValueConstraints::mergeSymbolVars( - FlatAffineValueConstraints &other) { - - assert(areVarsUnique(*this, VarKind::Symbol) && "Symbol vars are not unique"); - assert(areVarsUnique(other, VarKind::Symbol) && "Symbol vars are not unique"); - - SmallVector aSymValues; - getValues(getNumDimVars(), getNumDimAndSymbolVars(), &aSymValues); - - // Merge symbols: merge symbols into `other` first from `this`. - unsigned s = other.getNumDimVars(); - for (Value aSymValue : aSymValues) { - unsigned loc; - // If the var is a symbol in `other`, then align it, otherwise assume that - // it is a new symbol - if (other.findVar(aSymValue, &loc) && loc >= other.getNumDimVars() && - loc < other.getNumDimAndSymbolVars()) - other.swapVar(s, loc); - else - other.insertSymbolVar(s - other.getNumDimVars(), aSymValue); - s++; - } - - // Symbols that are in other, but not in this, are added at the end. - for (unsigned t = other.getNumDimVars() + getNumSymbolVars(), - e = other.getNumDimAndSymbolVars(); - t < e; t++) - insertSymbolVar(getNumSymbolVars(), other.getValue(t)); - - assert(getNumSymbolVars() == other.getNumSymbolVars() && - "expected same number of symbols"); - assert(areVarsUnique(*this, VarKind::Symbol) && "Symbol vars are not unique"); - assert(areVarsUnique(other, VarKind::Symbol) && "Symbol vars are not unique"); -} - -// Changes all symbol variables which are loop IVs to dim variables. -void FlatAffineValueConstraints::convertLoopIVSymbolsToDims() { - // Gather all symbols which are loop IVs. - SmallVector loopIVs; - for (unsigned i = getNumDimVars(), e = getNumDimAndSymbolVars(); i < e; i++) { - if (hasValue(i) && getForInductionVarOwner(getValue(i))) - loopIVs.push_back(getValue(i)); - } - // Turn each symbol in 'loopIVs' into a dim variable. 
- for (auto iv : loopIVs) { - turnSymbolIntoDim(this, iv); - } -} void FlatAffineValueConstraints::addInductionVarOrTerminalSymbol(Value val) { if (containsVar(val)) @@ -709,559 +211,6 @@ void FlatAffineValueConstraints::addAffineIfOpDomain(AffineIfOp ifOp) { append(cst); } -bool FlatAffineValueConstraints::hasConsistentState() const { - return IntegerPolyhedron::hasConsistentState() && - values.size() == getNumDimAndSymbolVars(); -} - -void FlatAffineValueConstraints::removeVarRange(VarKind kind, unsigned varStart, - unsigned varLimit) { - IntegerPolyhedron::removeVarRange(kind, varStart, varLimit); - unsigned offset = getVarKindOffset(kind); - - if (kind != VarKind::Local) { - values.erase(values.begin() + varStart + offset, - values.begin() + varLimit + offset); - } -} - -// Determine whether the variable at 'pos' (say var_r) can be expressed as -// modulo of another known variable (say var_n) w.r.t a constant. For example, -// if the following constraints hold true: -// ``` -// 0 <= var_r <= divisor - 1 -// var_n - (divisor * q_expr) = var_r -// ``` -// where `var_n` is a known variable (called dividend), and `q_expr` is an -// `AffineExpr` (called the quotient expression), `var_r` can be written as: -// -// `var_r = var_n mod divisor`. -// -// Additionally, in a special case of the above constaints where `q_expr` is an -// variable itself that is not yet known (say `var_q`), it can be written as a -// floordiv in the following way: -// -// `var_q = var_n floordiv divisor`. -// -// Returns true if the above mod or floordiv are detected, updating 'memo' with -// these new expressions. Returns false otherwise. -static bool detectAsMod(const FlatAffineValueConstraints &cst, unsigned pos, - int64_t lbConst, int64_t ubConst, - SmallVectorImpl &memo, - MLIRContext *context) { - assert(pos < cst.getNumVars() && "invalid position"); - - // Check if a divisor satisfying the condition `0 <= var_r <= divisor - 1` can - // be determined. - if (lbConst != 0 || ubConst < 1) - return false; - int64_t divisor = ubConst + 1; - - // Check for the aforementioned conditions in each equality. - for (unsigned curEquality = 0, numEqualities = cst.getNumEqualities(); - curEquality < numEqualities; curEquality++) { - int64_t coefficientAtPos = cst.atEq64(curEquality, pos); - // If current equality does not involve `var_r`, continue to the next - // equality. - if (coefficientAtPos == 0) - continue; - - // Constant term should be 0 in this equality. - if (cst.atEq64(curEquality, cst.getNumCols() - 1) != 0) - continue; - - // Traverse through the equality and construct the dividend expression - // `dividendExpr`, to contain all the variables which are known and are - // not divisible by `(coefficientAtPos * divisor)`. Hope here is that the - // `dividendExpr` gets simplified into a single variable `var_n` discussed - // above. - auto dividendExpr = getAffineConstantExpr(0, context); - - // Track the terms that go into quotient expression, later used to detect - // additional floordiv. - unsigned quotientCount = 0; - int quotientPosition = -1; - int quotientSign = 1; - - // Consider each term in the current equality. - unsigned curVar, e; - for (curVar = 0, e = cst.getNumDimAndSymbolVars(); curVar < e; ++curVar) { - // Ignore var_r. - if (curVar == pos) - continue; - int64_t coefficientOfCurVar = cst.atEq64(curEquality, curVar); - // Ignore vars that do not contribute to the current equality. - if (coefficientOfCurVar == 0) - continue; - // Check if the current var goes into the quotient expression. 
- if (coefficientOfCurVar % (divisor * coefficientAtPos) == 0) { - quotientCount++; - quotientPosition = curVar; - quotientSign = (coefficientOfCurVar * coefficientAtPos) > 0 ? 1 : -1; - continue; - } - // Variables that are part of dividendExpr should be known. - if (!memo[curVar]) - break; - // Append the current variable to the dividend expression. - dividendExpr = dividendExpr + memo[curVar] * coefficientOfCurVar; - } - - // Can't construct expression as it depends on a yet uncomputed var. - if (curVar < e) - continue; - - // Express `var_r` in terms of the other vars collected so far. - if (coefficientAtPos > 0) - dividendExpr = (-dividendExpr).floorDiv(coefficientAtPos); - else - dividendExpr = dividendExpr.floorDiv(-coefficientAtPos); - - // Simplify the expression. - dividendExpr = simplifyAffineExpr(dividendExpr, cst.getNumDimVars(), - cst.getNumSymbolVars()); - // Only if the final dividend expression is just a single var (which we call - // `var_n`), we can proceed. - // TODO: Handle AffineSymbolExpr as well. There is no reason to restrict it - // to dims themselves. - auto dimExpr = dividendExpr.dyn_cast(); - if (!dimExpr) - continue; - - // Express `var_r` as `var_n % divisor` and store the expression in `memo`. - if (quotientCount >= 1) { - auto ub = cst.getConstantBound64( - FlatAffineValueConstraints::BoundType::UB, dimExpr.getPosition()); - // If `var_n` has an upperbound that is less than the divisor, mod can be - // eliminated altogether. - if (ub && *ub < divisor) - memo[pos] = dimExpr; - else - memo[pos] = dimExpr % divisor; - // If a unique quotient `var_q` was seen, it can be expressed as - // `var_n floordiv divisor`. - if (quotientCount == 1 && !memo[quotientPosition]) - memo[quotientPosition] = dimExpr.floorDiv(divisor) * quotientSign; - - return true; - } - } - return false; -} - -/// Check if the pos^th variable can be expressed as a floordiv of an affine -/// function of other variables (where the divisor is a positive constant) -/// given the initial set of expressions in `exprs`. If it can be, the -/// corresponding position in `exprs` is set as the detected affine expr. For -/// eg: 4q <= i + j <= 4q + 3 <=> q = (i + j) floordiv 4. An equality can -/// also yield a floordiv: eg. 4q = i + j <=> q = (i + j) floordiv 4. 32q + 28 -/// <= i <= 32q + 31 => q = i floordiv 32. -static bool detectAsFloorDiv(const FlatAffineValueConstraints &cst, - unsigned pos, MLIRContext *context, - SmallVectorImpl &exprs) { - assert(pos < cst.getNumVars() && "invalid position"); - - // Get upper-lower bound pair for this variable. - SmallVector foundRepr(cst.getNumVars(), false); - for (unsigned i = 0, e = cst.getNumVars(); i < e; ++i) - if (exprs[i]) - foundRepr[i] = true; - - SmallVector dividend(cst.getNumCols()); - unsigned divisor; - auto ulPair = computeSingleVarRepr(cst, foundRepr, pos, dividend, divisor); - - // No upper-lower bound pair found for this var. - if (ulPair.kind == ReprKind::None || ulPair.kind == ReprKind::Equality) - return false; - - // Construct the dividend expression. - auto dividendExpr = getAffineConstantExpr(dividend.back(), context); - for (unsigned c = 0, f = cst.getNumVars(); c < f; c++) - if (dividend[c] != 0) - dividendExpr = dividendExpr + dividend[c] * exprs[c]; - - // Successfully detected the floordiv. 
- exprs[pos] = dividendExpr.floorDiv(divisor); - return true; -} - -std::pair -FlatAffineValueConstraints::getLowerAndUpperBound( - unsigned pos, unsigned offset, unsigned num, unsigned symStartPos, - ArrayRef localExprs, MLIRContext *context, - bool closedUB) const { - assert(pos + offset < getNumDimVars() && "invalid dim start pos"); - assert(symStartPos >= (pos + offset) && "invalid sym start pos"); - assert(getNumLocalVars() == localExprs.size() && - "incorrect local exprs count"); - - SmallVector lbIndices, ubIndices, eqIndices; - getLowerAndUpperBoundIndices(pos + offset, &lbIndices, &ubIndices, &eqIndices, - offset, num); - - /// Add to 'b' from 'a' in set [0, offset) U [offset + num, symbStartPos). - auto addCoeffs = [&](ArrayRef a, SmallVectorImpl &b) { - b.clear(); - for (unsigned i = 0, e = a.size(); i < e; ++i) { - if (i < offset || i >= offset + num) - b.push_back(a[i]); - } - }; - - SmallVector lb, ub; - SmallVector lbExprs; - unsigned dimCount = symStartPos - num; - unsigned symCount = getNumDimAndSymbolVars() - symStartPos; - lbExprs.reserve(lbIndices.size() + eqIndices.size()); - // Lower bound expressions. - for (auto idx : lbIndices) { - auto ineq = getInequality64(idx); - // Extract the lower bound (in terms of other coeff's + const), i.e., if - // i - j + 1 >= 0 is the constraint, 'pos' is for i the lower bound is j - // - 1. - addCoeffs(ineq, lb); - std::transform(lb.begin(), lb.end(), lb.begin(), std::negate()); - auto expr = - getAffineExprFromFlatForm(lb, dimCount, symCount, localExprs, context); - // expr ceildiv divisor is (expr + divisor - 1) floordiv divisor - int64_t divisor = std::abs(ineq[pos + offset]); - expr = (expr + divisor - 1).floorDiv(divisor); - lbExprs.push_back(expr); - } - - SmallVector ubExprs; - ubExprs.reserve(ubIndices.size() + eqIndices.size()); - // Upper bound expressions. - for (auto idx : ubIndices) { - auto ineq = getInequality64(idx); - // Extract the upper bound (in terms of other coeff's + const). - addCoeffs(ineq, ub); - auto expr = - getAffineExprFromFlatForm(ub, dimCount, symCount, localExprs, context); - expr = expr.floorDiv(std::abs(ineq[pos + offset])); - int64_t ubAdjustment = closedUB ? 0 : 1; - ubExprs.push_back(expr + ubAdjustment); - } - - // Equalities. It's both a lower and a upper bound. - SmallVector b; - for (auto idx : eqIndices) { - auto eq = getEquality64(idx); - addCoeffs(eq, b); - if (eq[pos + offset] > 0) - std::transform(b.begin(), b.end(), b.begin(), std::negate()); - - // Extract the upper bound (in terms of other coeff's + const). - auto expr = - getAffineExprFromFlatForm(b, dimCount, symCount, localExprs, context); - expr = expr.floorDiv(std::abs(eq[pos + offset])); - // Upper bound is exclusive. - ubExprs.push_back(expr + 1); - // Lower bound. - expr = - getAffineExprFromFlatForm(b, dimCount, symCount, localExprs, context); - expr = expr.ceilDiv(std::abs(eq[pos + offset])); - lbExprs.push_back(expr); - } - - auto lbMap = AffineMap::get(dimCount, symCount, lbExprs, context); - auto ubMap = AffineMap::get(dimCount, symCount, ubExprs, context); - - return {lbMap, ubMap}; -} - -/// Computes the lower and upper bounds of the first 'num' dimensional -/// variables (starting at 'offset') as affine maps of the remaining -/// variables (dimensional and symbolic variables). Local variables are -/// themselves explicitly computed as affine functions of other variables in -/// this process if needed. 
-void FlatAffineValueConstraints::getSliceBounds( - unsigned offset, unsigned num, MLIRContext *context, - SmallVectorImpl *lbMaps, SmallVectorImpl *ubMaps, - bool closedUB) { - assert(num < getNumDimVars() && "invalid range"); - - // Basic simplification. - normalizeConstraintsByGCD(); - - LLVM_DEBUG(llvm::dbgs() << "getSliceBounds for first " << num - << " variables\n"); - LLVM_DEBUG(dump()); - - // Record computed/detected variables. - SmallVector memo(getNumVars()); - // Initialize dimensional and symbolic variables. - for (unsigned i = 0, e = getNumDimVars(); i < e; i++) { - if (i < offset) - memo[i] = getAffineDimExpr(i, context); - else if (i >= offset + num) - memo[i] = getAffineDimExpr(i - num, context); - } - for (unsigned i = getNumDimVars(), e = getNumDimAndSymbolVars(); i < e; i++) - memo[i] = getAffineSymbolExpr(i - getNumDimVars(), context); - - bool changed; - do { - changed = false; - // Identify yet unknown variables as constants or mod's / floordiv's of - // other variables if possible. - for (unsigned pos = 0; pos < getNumVars(); pos++) { - if (memo[pos]) - continue; - - auto lbConst = getConstantBound64(BoundType::LB, pos); - auto ubConst = getConstantBound64(BoundType::UB, pos); - if (lbConst.has_value() && ubConst.has_value()) { - // Detect equality to a constant. - if (*lbConst == *ubConst) { - memo[pos] = getAffineConstantExpr(*lbConst, context); - changed = true; - continue; - } - - // Detect an variable as modulo of another variable w.r.t a - // constant. - if (detectAsMod(*this, pos, *lbConst, *ubConst, memo, context)) { - changed = true; - continue; - } - } - - // Detect an variable as a floordiv of an affine function of other - // variables (divisor is a positive constant). - if (detectAsFloorDiv(*this, pos, context, memo)) { - changed = true; - continue; - } - - // Detect an variable as an expression of other variables. - unsigned idx; - if (!findConstraintWithNonZeroAt(pos, /*isEq=*/true, &idx)) { - continue; - } - - // Build AffineExpr solving for variable 'pos' in terms of all others. - auto expr = getAffineConstantExpr(0, context); - unsigned j, e; - for (j = 0, e = getNumVars(); j < e; ++j) { - if (j == pos) - continue; - int64_t c = atEq64(idx, j); - if (c == 0) - continue; - // If any of the involved IDs hasn't been found yet, we can't proceed. - if (!memo[j]) - break; - expr = expr + memo[j] * c; - } - if (j < e) - // Can't construct expression as it depends on a yet uncomputed - // variable. - continue; - - // Add constant term to AffineExpr. - expr = expr + atEq64(idx, getNumVars()); - int64_t vPos = atEq64(idx, pos); - assert(vPos != 0 && "expected non-zero here"); - if (vPos > 0) - expr = (-expr).floorDiv(vPos); - else - // vPos < 0. - expr = expr.floorDiv(-vPos); - // Successfully constructed expression. - memo[pos] = expr; - changed = true; - } - // This loop is guaranteed to reach a fixed point - since once an - // variable's explicit form is computed (in memo[pos]), it's not updated - // again. - } while (changed); - - int64_t ubAdjustment = closedUB ? 0 : 1; - - // Set the lower and upper bound maps for all the variables that were - // computed as affine expressions of the rest as the "detected expr" and - // "detected expr + 1" respectively; set the undetected ones to null. 
- std::optional tmpClone; - for (unsigned pos = 0; pos < num; pos++) { - unsigned numMapDims = getNumDimVars() - num; - unsigned numMapSymbols = getNumSymbolVars(); - AffineExpr expr = memo[pos + offset]; - if (expr) - expr = simplifyAffineExpr(expr, numMapDims, numMapSymbols); - - AffineMap &lbMap = (*lbMaps)[pos]; - AffineMap &ubMap = (*ubMaps)[pos]; - - if (expr) { - lbMap = AffineMap::get(numMapDims, numMapSymbols, expr); - ubMap = AffineMap::get(numMapDims, numMapSymbols, expr + ubAdjustment); - } else { - // TODO: Whenever there are local variables in the dependence - // constraints, we'll conservatively over-approximate, since we don't - // always explicitly compute them above (in the while loop). - if (getNumLocalVars() == 0) { - // Work on a copy so that we don't update this constraint system. - if (!tmpClone) { - tmpClone.emplace(FlatAffineValueConstraints(*this)); - // Removing redundant inequalities is necessary so that we don't get - // redundant loop bounds. - tmpClone->removeRedundantInequalities(); - } - std::tie(lbMap, ubMap) = tmpClone->getLowerAndUpperBound( - pos, offset, num, getNumDimVars(), /*localExprs=*/{}, context, - closedUB); - } - - // If the above fails, we'll just use the constant lower bound and the - // constant upper bound (if they exist) as the slice bounds. - // TODO: being conservative for the moment in cases that - // lead to multiple bounds - until getConstDifference in LoopFusion.cpp is - // fixed (b/126426796). - if (!lbMap || lbMap.getNumResults() > 1) { - LLVM_DEBUG(llvm::dbgs() - << "WARNING: Potentially over-approximating slice lb\n"); - auto lbConst = getConstantBound64(BoundType::LB, pos + offset); - if (lbConst.has_value()) { - lbMap = AffineMap::get(numMapDims, numMapSymbols, - getAffineConstantExpr(*lbConst, context)); - } - } - if (!ubMap || ubMap.getNumResults() > 1) { - LLVM_DEBUG(llvm::dbgs() - << "WARNING: Potentially over-approximating slice ub\n"); - auto ubConst = getConstantBound64(BoundType::UB, pos + offset); - if (ubConst.has_value()) { - ubMap = AffineMap::get( - numMapDims, numMapSymbols, - getAffineConstantExpr(*ubConst + ubAdjustment, context)); - } - } - } - LLVM_DEBUG(llvm::dbgs() - << "lb map for pos = " << Twine(pos + offset) << ", expr: "); - LLVM_DEBUG(lbMap.dump();); - LLVM_DEBUG(llvm::dbgs() - << "ub map for pos = " << Twine(pos + offset) << ", expr: "); - LLVM_DEBUG(ubMap.dump();); - } -} - -LogicalResult FlatAffineValueConstraints::flattenAlignedMapAndMergeLocals( - AffineMap map, std::vector> *flattenedExprs) { - FlatAffineValueConstraints localCst; - if (failed(getFlattenedAffineExprs(map, flattenedExprs, &localCst))) { - LLVM_DEBUG(llvm::dbgs() - << "composition unimplemented for semi-affine maps\n"); - return failure(); - } - - // Add localCst information. - if (localCst.getNumLocalVars() > 0) { - unsigned numLocalVars = getNumLocalVars(); - // Insert local dims of localCst at the beginning. - insertLocalVar(/*pos=*/0, /*num=*/localCst.getNumLocalVars()); - // Insert local dims of `this` at the end of localCst. - localCst.appendLocalVar(/*num=*/numLocalVars); - // Dimensions of localCst and this constraint set match. Append localCst to - // this constraint set. 
- append(localCst); - } - - return success(); -} - -LogicalResult FlatAffineValueConstraints::addBound(BoundType type, unsigned pos, - AffineMap boundMap, - bool isClosedBound) { - assert(boundMap.getNumDims() == getNumDimVars() && "dim mismatch"); - assert(boundMap.getNumSymbols() == getNumSymbolVars() && "symbol mismatch"); - assert(pos < getNumDimAndSymbolVars() && "invalid position"); - assert((type != BoundType::EQ || isClosedBound) && - "EQ bound must be closed."); - - // Equality follows the logic of lower bound except that we add an equality - // instead of an inequality. - assert((type != BoundType::EQ || boundMap.getNumResults() == 1) && - "single result expected"); - bool lower = type == BoundType::LB || type == BoundType::EQ; - - std::vector> flatExprs; - if (failed(flattenAlignedMapAndMergeLocals(boundMap, &flatExprs))) - return failure(); - assert(flatExprs.size() == boundMap.getNumResults()); - - // Add one (in)equality for each result. - for (const auto &flatExpr : flatExprs) { - SmallVector ineq(getNumCols(), 0); - // Dims and symbols. - for (unsigned j = 0, e = boundMap.getNumInputs(); j < e; j++) { - ineq[j] = lower ? -flatExpr[j] : flatExpr[j]; - } - // Invalid bound: pos appears in `boundMap`. - // TODO: This should be an assertion. Fix `addDomainFromSliceMaps` and/or - // its callers to prevent invalid bounds from being added. - if (ineq[pos] != 0) - continue; - ineq[pos] = lower ? 1 : -1; - // Local columns of `ineq` are at the beginning. - unsigned j = getNumDimVars() + getNumSymbolVars(); - unsigned end = flatExpr.size() - 1; - for (unsigned i = boundMap.getNumInputs(); i < end; i++, j++) { - ineq[j] = lower ? -flatExpr[i] : flatExpr[i]; - } - // Make the bound closed in if flatExpr is open. The inequality is always - // created in the upper bound form, so the adjustment is -1. - int64_t boundAdjustment = (isClosedBound || type == BoundType::EQ) ? 0 : -1; - // Constant term. - ineq[getNumCols() - 1] = (lower ? -flatExpr[flatExpr.size() - 1] - : flatExpr[flatExpr.size() - 1]) + - boundAdjustment; - type == BoundType::EQ ? addEquality(ineq) : addInequality(ineq); - } - - return success(); -} - -LogicalResult FlatAffineValueConstraints::addBound(BoundType type, unsigned pos, - AffineMap boundMap) { - return addBound(type, pos, boundMap, /*isClosedBound=*/type != BoundType::UB); -} - -AffineMap -FlatAffineValueConstraints::computeAlignedMap(AffineMap map, - ValueRange operands) const { - assert(map.getNumInputs() == operands.size() && "number of inputs mismatch"); - - SmallVector dims, syms; -#ifndef NDEBUG - SmallVector newSyms; - SmallVector *newSymsPtr = &newSyms; -#else - SmallVector *newSymsPtr = nullptr; -#endif // NDEBUG - - dims.reserve(getNumDimVars()); - syms.reserve(getNumSymbolVars()); - for (unsigned i = getVarKindOffset(VarKind::SetDim), - e = getVarKindEnd(VarKind::SetDim); - i < e; ++i) - dims.push_back(values[i] ? *values[i] : Value()); - for (unsigned i = getVarKindOffset(VarKind::Symbol), - e = getVarKindEnd(VarKind::Symbol); - i < e; ++i) - syms.push_back(values[i] ? *values[i] : Value()); - - AffineMap alignedMap = - alignAffineMapWithValues(map, operands, dims, syms, newSymsPtr); - // All symbols are already part of this FlatAffineConstraints. 
- assert(syms.size() == newSymsPtr->size() && "unexpected new/missing symbols"); - assert(std::equal(syms.begin(), syms.end(), newSymsPtr->begin()) && - "unexpected new/missing symbols"); - return alignedMap; -} - LogicalResult FlatAffineValueConstraints::addBound(BoundType type, unsigned pos, AffineMap boundMap, ValueRange boundOperands) { @@ -1329,149 +278,34 @@ LogicalResult FlatAffineValueConstraints::addSliceBounds( return success(); } -bool FlatAffineValueConstraints::findVar(Value val, unsigned *pos) const { - unsigned i = 0; - for (const auto &mayBeVar : values) { - if (mayBeVar && *mayBeVar == val) { - *pos = i; - return true; - } - i++; - } - return false; -} - -bool FlatAffineValueConstraints::containsVar(Value val) const { - return llvm::any_of(values, [&](const std::optional &mayBeVar) { - return mayBeVar && *mayBeVar == val; - }); -} - -void FlatAffineValueConstraints::swapVar(unsigned posA, unsigned posB) { - IntegerPolyhedron::swapVar(posA, posB); - - if (getVarKindAt(posA) == VarKind::Local && - getVarKindAt(posB) == VarKind::Local) - return; - - // Treat value of a local variable as std::nullopt. - if (getVarKindAt(posA) == VarKind::Local) - values[posB] = std::nullopt; - else if (getVarKindAt(posB) == VarKind::Local) - values[posA] = std::nullopt; - else - std::swap(values[posA], values[posB]); +LogicalResult +FlatAffineValueConstraints::composeMap(const AffineValueMap *vMap) { + return composeMatchingMap( + computeAlignedMap(vMap->getAffineMap(), vMap->getOperands())); } -void FlatAffineValueConstraints::addBound(BoundType type, Value val, - int64_t value) { +// Turn a symbol into a dimension. +static void turnSymbolIntoDim(FlatAffineValueConstraints *cst, Value value) { unsigned pos; - if (!findVar(val, &pos)) - // This is a pre-condition for this method. - assert(0 && "var not found"); - addBound(type, pos, value); -} - -void FlatAffineValueConstraints::printSpace(raw_ostream &os) const { - IntegerPolyhedron::printSpace(os); - os << "("; - for (unsigned i = 0, e = getNumDimAndSymbolVars(); i < e; i++) { - if (hasValue(i)) - os << "Value\t"; - else - os << "None\t"; + if (cst->findVar(value, &pos) && pos >= cst->getNumDimVars() && + pos < cst->getNumDimAndSymbolVars()) { + cst->swapVar(pos, cst->getNumDimVars()); + cst->setDimSymbolSeparation(cst->getNumSymbolVars() - 1); } - for (unsigned i = getVarKindOffset(VarKind::Local), - e = getVarKindEnd(VarKind::Local); - i < e; ++i) - os << "Local\t"; - os << "const)\n"; } -void FlatAffineValueConstraints::clearAndCopyFrom( - const IntegerRelation &other) { - - if (auto *otherValueSet = - dyn_cast(&other)) { - *this = *otherValueSet; - } else { - *static_cast(this) = other; - values.clear(); - values.resize(getNumDimAndSymbolVars(), std::nullopt); +// Changes all symbol variables which are loop IVs to dim variables. +void FlatAffineValueConstraints::convertLoopIVSymbolsToDims() { + // Gather all symbols which are loop IVs. + SmallVector loopIVs; + for (unsigned i = getNumDimVars(), e = getNumDimAndSymbolVars(); i < e; i++) { + if (hasValue(i) && getForInductionVarOwner(getValue(i))) + loopIVs.push_back(getValue(i)); } -} - -void FlatAffineValueConstraints::fourierMotzkinEliminate( - unsigned pos, bool darkShadow, bool *isResultIntegerExact) { - SmallVector, 8> newVals = values; - if (getVarKindAt(pos) != VarKind::Local) - newVals.erase(newVals.begin() + pos); - // Note: Base implementation discards all associated Values. 
- IntegerPolyhedron::fourierMotzkinEliminate(pos, darkShadow, - isResultIntegerExact); - values = newVals; - assert(values.size() == getNumDimAndSymbolVars()); -} - -void FlatAffineValueConstraints::projectOut(Value val) { - unsigned pos; - bool ret = findVar(val, &pos); - assert(ret); - (void)ret; - fourierMotzkinEliminate(pos); -} - -LogicalResult FlatAffineValueConstraints::unionBoundingBox( - const FlatAffineValueConstraints &otherCst) { - assert(otherCst.getNumDimVars() == getNumDimVars() && "dims mismatch"); - assert(otherCst.getMaybeValues() - .slice(0, getNumDimVars()) - .equals(getMaybeValues().slice(0, getNumDimVars())) && - "dim values mismatch"); - assert(otherCst.getNumLocalVars() == 0 && "local vars not supported here"); - assert(getNumLocalVars() == 0 && "local vars not supported yet here"); - - // Align `other` to this. - if (!areVarsAligned(*this, otherCst)) { - FlatAffineValueConstraints otherCopy(otherCst); - mergeAndAlignVars(/*offset=*/getNumDimVars(), this, &otherCopy); - return IntegerPolyhedron::unionBoundingBox(otherCopy); + // Turn each symbol in 'loopIVs' into a dim variable. + for (auto iv : loopIVs) { + turnSymbolIntoDim(this, iv); } - - return IntegerPolyhedron::unionBoundingBox(otherCst); -} - -/// Compute an explicit representation for local vars. For all systems coming -/// from MLIR integer sets, maps, or expressions where local vars were -/// introduced to model floordivs and mods, this always succeeds. -static LogicalResult computeLocalVars(const FlatAffineValueConstraints &cst, - SmallVectorImpl &memo, - MLIRContext *context) { - unsigned numDims = cst.getNumDimVars(); - unsigned numSyms = cst.getNumSymbolVars(); - - // Initialize dimensional and symbolic variables. - for (unsigned i = 0; i < numDims; i++) - memo[i] = getAffineDimExpr(i, context); - for (unsigned i = numDims, e = numDims + numSyms; i < e; i++) - memo[i] = getAffineSymbolExpr(i - numDims, context); - - bool changed; - do { - // Each time `changed` is true at the end of this iteration, one or more - // local vars would have been detected as floordivs and set in memo; so the - // number of null entries in memo[...] strictly reduces; so this converges. - changed = false; - for (unsigned i = 0, e = cst.getNumLocalVars(); i < e; ++i) - if (!memo[numDims + numSyms + i] && - detectAsFloorDiv(cst, /*pos=*/numDims + numSyms + i, context, memo)) - changed = true; - } while (changed); - - ArrayRef localExprs = - ArrayRef(memo).take_back(cst.getNumLocalVars()); - return success( - llvm::all_of(localExprs, [](AffineExpr expr) { return expr; })); } void FlatAffineValueConstraints::getIneqAsAffineValueMap( @@ -1485,7 +319,7 @@ void FlatAffineValueConstraints::getIneqAsAffineValueMap( // Get expressions for local vars. SmallVector memo(getNumVars(), AffineExpr()); - if (failed(computeLocalVars(*this, memo, context))) + if (failed(computeLocalVars(memo, context))) assert(false && "one or more local exprs do not have an explicit representation"); auto localExprs = ArrayRef(memo).take_back(getNumLocalVars()); @@ -1519,105 +353,6 @@ void FlatAffineValueConstraints::getIneqAsAffineValueMap( vmap.reset(AffineMap::get(numDims - 1, numSyms, boundExpr), operands); } -IntegerSet -FlatAffineValueConstraints::getAsIntegerSet(MLIRContext *context) const { - if (getNumConstraints() == 0) - // Return universal set (always true): 0 == 0. - return IntegerSet::get(getNumDimVars(), getNumSymbolVars(), - getAffineConstantExpr(/*constant=*/0, context), - /*eqFlags=*/true); - - // Construct local references. 
- SmallVector memo(getNumVars(), AffineExpr()); - - if (failed(computeLocalVars(*this, memo, context))) { - // Check if the local variables without an explicit representation have - // zero coefficients everywhere. - SmallVector noLocalRepVars; - unsigned numDimsSymbols = getNumDimAndSymbolVars(); - for (unsigned i = numDimsSymbols, e = getNumVars(); i < e; ++i) { - if (!memo[i] && !isColZero(/*pos=*/i)) - noLocalRepVars.push_back(i - numDimsSymbols); - } - if (!noLocalRepVars.empty()) { - LLVM_DEBUG({ - llvm::dbgs() << "local variables at position(s) "; - llvm::interleaveComma(noLocalRepVars, llvm::dbgs()); - llvm::dbgs() << " do not have an explicit representation in:\n"; - this->dump(); - }); - return IntegerSet(); - } - } - - ArrayRef localExprs = - ArrayRef(memo).take_back(getNumLocalVars()); - - // Construct the IntegerSet from the equalities/inequalities. - unsigned numDims = getNumDimVars(); - unsigned numSyms = getNumSymbolVars(); - - SmallVector eqFlags(getNumConstraints()); - std::fill(eqFlags.begin(), eqFlags.begin() + getNumEqualities(), true); - std::fill(eqFlags.begin() + getNumEqualities(), eqFlags.end(), false); - - SmallVector exprs; - exprs.reserve(getNumConstraints()); - - for (unsigned i = 0, e = getNumEqualities(); i < e; ++i) - exprs.push_back(getAffineExprFromFlatForm(getEquality64(i), numDims, - numSyms, localExprs, context)); - for (unsigned i = 0, e = getNumInequalities(); i < e; ++i) - exprs.push_back(getAffineExprFromFlatForm(getInequality64(i), numDims, - numSyms, localExprs, context)); - return IntegerSet::get(numDims, numSyms, exprs, eqFlags); -} - -AffineMap mlir::alignAffineMapWithValues(AffineMap map, ValueRange operands, - ValueRange dims, ValueRange syms, - SmallVector *newSyms) { - assert(operands.size() == map.getNumInputs() && - "expected same number of operands and map inputs"); - MLIRContext *ctx = map.getContext(); - Builder builder(ctx); - SmallVector dimReplacements(map.getNumDims(), {}); - unsigned numSymbols = syms.size(); - SmallVector symReplacements(map.getNumSymbols(), {}); - if (newSyms) { - newSyms->clear(); - newSyms->append(syms.begin(), syms.end()); - } - - for (const auto &operand : llvm::enumerate(operands)) { - // Compute replacement dim/sym of operand. - AffineExpr replacement; - auto dimIt = std::find(dims.begin(), dims.end(), operand.value()); - auto symIt = std::find(syms.begin(), syms.end(), operand.value()); - if (dimIt != dims.end()) { - replacement = - builder.getAffineDimExpr(std::distance(dims.begin(), dimIt)); - } else if (symIt != syms.end()) { - replacement = - builder.getAffineSymbolExpr(std::distance(syms.begin(), symIt)); - } else { - // This operand is neither a dimension nor a symbol. Add it as a new - // symbol. - replacement = builder.getAffineSymbolExpr(numSymbols++); - if (newSyms) - newSyms->push_back(operand.value()); - } - // Add to corresponding replacements vector. - if (operand.index() < map.getNumDims()) { - dimReplacements[operand.index()] = replacement; - } else { - symReplacements[operand.index() - map.getNumDims()] = replacement; - } - } - - return map.replaceDimsAndSymbols(dimReplacements, symReplacements, - dims.size(), numSymbols); -} - FlatAffineValueConstraints FlatAffineRelation::getDomainSet() const { FlatAffineValueConstraints domain = *this; // Convert all range variables to local variables. 
@@ -1806,31 +541,3 @@ LogicalResult mlir::getRelationFromMap(const AffineValueMap &map, return success(); } - -LogicalResult -mlir::getMultiAffineFunctionFromMap(AffineMap map, - MultiAffineFunction &multiAff) { - FlatAffineValueConstraints cst; - std::vector> flattenedExprs; - LogicalResult result = getFlattenedAffineExprs(map, &flattenedExprs, &cst); - - if (result.failed()) - return failure(); - - DivisionRepr divs = cst.getLocalReprs(); - assert(divs.hasAllReprs() && - "AffineMap cannot produce divs without local representation"); - - // TODO: We shouldn't have to do this conversion. - Matrix mat(map.getNumResults(), map.getNumInputs() + divs.getNumDivs() + 1); - for (unsigned i = 0, e = flattenedExprs.size(); i < e; ++i) - for (unsigned j = 0, f = flattenedExprs[i].size(); j < f; ++j) - mat(i, j) = flattenedExprs[i][j]; - - multiAff = MultiAffineFunction( - PresburgerSpace::getRelationSpace(map.getNumDims(), map.getNumResults(), - map.getNumSymbols(), divs.getNumDivs()), - mat, divs); - - return success(); -} diff --git a/mlir/lib/IR/AffineExpr.cpp b/mlir/lib/IR/AffineExpr.cpp index 554452cb265fd..8564bacedd21c 100644 --- a/mlir/lib/IR/AffineExpr.cpp +++ b/mlir/lib/IR/AffineExpr.cpp @@ -1290,7 +1290,7 @@ void SimpleAffineExprFlattener::addLocalVariableSemiAffine( // A floordiv is thus flattened by introducing a new local variable q, and // replacing that expression with 'q' while adding the constraints // c * q <= expr <= c * q + c - 1 to localVarCst (done by -// FlatAffineConstraints::addLocalFloorDiv). +// IntegerRelation::addLocalFloorDiv). // // A ceildiv is similarly flattened: // t = expr ceildiv c <=> t = (expr + c - 1) floordiv c diff --git a/mlir/lib/IR/AffineMap.cpp b/mlir/lib/IR/AffineMap.cpp index c924d2bcde556..39c8ab96aa662 100644 --- a/mlir/lib/IR/AffineMap.cpp +++ b/mlir/lib/IR/AffineMap.cpp @@ -829,7 +829,7 @@ bool MutableAffineMap::isMultipleOf(unsigned idx, int64_t factor) const { if (results[idx].isMultipleOf(factor)) return true; - // TODO: use simplifyAffineExpr and FlatAffineConstraints to + // TODO: use simplifyAffineExpr and FlatAffineValueConstraints to // complete this (for a more powerful analysis). return false; } diff --git a/mlir/test/Transforms/memref-bound-check.mlir b/mlir/test/Transforms/memref-bound-check.mlir index fce6bdbca4aa1..80909abee51d6 100644 --- a/mlir/test/Transforms/memref-bound-check.mlir +++ b/mlir/test/Transforms/memref-bound-check.mlir @@ -201,7 +201,7 @@ func.func @out_of_bounds() { // This test case accesses within bounds. Without removal of a certain type of // trivially redundant constraints (those differing only in their constant // term), the number of constraints here explodes, and this would return out of -// bounds errors conservatively due to FlatAffineConstraints::kExplosionFactor. +// bounds errors conservatively due to IntegerRelation::kExplosionFactor. 
#map3 = affine_map<(d0, d1) -> ((d0 * 72 + d1) floordiv 2304 + ((((d0 * 72 + d1) mod 2304) mod 1152) mod 9) floordiv 3)> #map4 = affine_map<(d0, d1) -> ((d0 * 72 + d1) mod 2304 - (((d0 * 72 + d1) mod 2304) floordiv 1152) * 1151 - ((((d0 * 72 + d1) mod 2304) mod 1152) floordiv 9) * 9 - (((((d0 * 72 + d1) mod 2304) mod 1152) mod 9) floordiv 3) * 3)> #map5 = affine_map<(d0, d1) -> (((((d0 * 72 + d1) mod 2304) mod 1152) floordiv 9) floordiv 8)> diff --git a/mlir/test/Transforms/memref-dependence-check.mlir b/mlir/test/Transforms/memref-dependence-check.mlir index 3a16a33a1ed11..f272277cc7904 100644 --- a/mlir/test/Transforms/memref-dependence-check.mlir +++ b/mlir/test/Transforms/memref-dependence-check.mlir @@ -636,7 +636,7 @@ func.func @mod_deps() { affine.for %i0 = 0 to 10 { %a0 = affine.apply affine_map<(d0) -> (d0 mod 2)> (%i0) // Results are conservative here since we currently don't have a way to - // represent strided sets in FlatAffineConstraints. + // represent strided sets in FlatAffineValueConstraints. %v0 = affine.load %m[%a0] : memref<100xf32> // expected-remark@above {{dependence from 0 to 0 at depth 1 = false}} // expected-remark@above {{dependence from 0 to 0 at depth 2 = false}} From e50f131ae6e22aefdaa502af09a3396f49726976 Mon Sep 17 00:00:00 2001 From: Uday Bondhugula Date: Thu, 23 Mar 2023 13:15:22 +0530 Subject: [PATCH 079/208] [MLIR][Affine] Fix bug and MSAN issue in affine loop utils Fix bug and MSAN issue in affine loop utils introduced by d25e022cd19b83c22a6022edb78c4b97a5fc1b49 (D146495). While on it, fix/clean up issues in immediately surrounding code. Differential Revision: https://reviews.llvm.org/D146698 --- mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp index 38d660d4ff90b..1e567a6db4108 100644 --- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp @@ -2057,6 +2057,8 @@ static LogicalResult generateCopy( OpBuilder topBuilder(f.getBody()); Value zeroIndex = topBuilder.create(f.getLoc(), 0); + *sizeInBytes = 0; + if (begin == end) return success(); @@ -2105,7 +2107,6 @@ static LogicalResult generateCopy( if (*numElements == 0) { LLVM_DEBUG(llvm::dbgs() << "Nothing to copy\n"); - *sizeInBytes = 0; return success(); } @@ -2183,8 +2184,7 @@ static LogicalResult generateCopy( // fastMemRefType is a constant shaped memref. auto maySizeInBytes = getIntOrFloatMemRefSizeInBytes(fastMemRefType); // We don't account for things of unknown size. - if (!maySizeInBytes) - maySizeInBytes = 0; + *sizeInBytes = maySizeInBytes ? *maySizeInBytes : 0; LLVM_DEBUG(emitRemarkForBlock(*block) << "Creating fast buffer of type " << fastMemRefType @@ -2193,7 +2193,6 @@ static LogicalResult generateCopy( } else { // Reuse the one already created. 
fastMemRef = fastBufferMap[memref]; - *sizeInBytes = 0; } auto numElementsSSA = top.create(loc, *numElements); @@ -2554,13 +2553,13 @@ LogicalResult mlir::affineDataCopyGenerate(Block::iterator begin, if (llvm::DebugFlag && (forOp = dyn_cast(&*begin))) { LLVM_DEBUG(forOp.emitRemark() << llvm::divideCeil(totalCopyBuffersSizeInBytes, 1024) - << " KiB of copy buffers in fast memory space for this block\n"); + << " KiB of copy buffers in fast memory space for this block"); } if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) { - StringRef str = "Total size of all copy buffers' for this block " - "exceeds fast memory capacity\n"; - block->getParentOp()->emitWarning(str); + block->getParentOp()->emitWarning( + "total size of all copy buffers' for this block exceeds fast memory " + "capacity"); } return success(); From 47bff1cc46b5de96841fd1592df0c828e1d38e35 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Thu, 23 Mar 2023 09:40:51 +0100 Subject: [PATCH 080/208] [mlir][Analysis][NFC] Make BoundType a top-level enum `BoundType` is no longer a nested member of `IntegerRelation` but a top-level enum in the `presburger` namespace. This allows `BoundType` to be predeclared in header files. Nested members cannot be predeclared. Differential Revision: https://reviews.llvm.org/D146210 --- .../Analysis/FlatLinearValueConstraints.h | 9 +++--- .../Analysis/Presburger/IntegerRelation.h | 6 ++-- .../Affine/Analysis/AffineStructures.h | 4 +-- .../Analysis/FlatLinearValueConstraints.cpp | 3 +- mlir/lib/Analysis/Presburger/PWMAFunction.cpp | 4 +-- .../Affine/Analysis/AffineAnalysis.cpp | 6 ++-- mlir/lib/Dialect/Affine/Analysis/Utils.cpp | 31 +++++++++---------- .../TransformOps/AffineTransformOps.cpp | 6 ++-- mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp | 4 +-- mlir/lib/Dialect/Affine/Utils/Utils.cpp | 7 ++--- mlir/lib/Dialect/Linalg/Utils/Utils.cpp | 7 ++--- .../SCF/Utils/AffineCanonicalizationUtils.cpp | 10 +++--- .../Presburger/IntegerPolyhedronTest.cpp | 8 ++--- 13 files changed, 48 insertions(+), 57 deletions(-) diff --git a/mlir/include/mlir/Analysis/FlatLinearValueConstraints.h b/mlir/include/mlir/Analysis/FlatLinearValueConstraints.h index a6900ab599386..abebd7328f823 100644 --- a/mlir/include/mlir/Analysis/FlatLinearValueConstraints.h +++ b/mlir/include/mlir/Analysis/FlatLinearValueConstraints.h @@ -87,8 +87,8 @@ class FlatLinearConstraints : public presburger::IntegerPolyhedron { /// /// Note: The dimensions/symbols of this FlatLinearConstraints must match the /// dimensions/symbols of the affine map. - LogicalResult addBound(BoundType type, unsigned pos, AffineMap boundMap, - bool isClosedBound); + LogicalResult addBound(presburger::BoundType type, unsigned pos, + AffineMap boundMap, bool isClosedBound); /// Adds a bound for the variable at the specified position with constraints /// being drawn from the specified bound map. In case of an EQ bound, the @@ -98,7 +98,8 @@ class FlatLinearConstraints : public presburger::IntegerPolyhedron { /// Note: The dimensions/symbols of this FlatLinearConstraints must match the /// dimensions/symbols of the affine map. By default the lower bound is closed /// and the upper bound is open. - LogicalResult addBound(BoundType type, unsigned pos, AffineMap boundMap); + LogicalResult addBound(presburger::BoundType type, unsigned pos, + AffineMap boundMap); /// The `addBound` overload above hides the inherited overloads by default, so /// we explicitly introduce them here. 
@@ -315,7 +316,7 @@ class FlatLinearValueConstraints : public FlatLinearConstraints { void clearAndCopyFrom(const IntegerRelation &other) override; /// Adds a constant bound for the variable associated with the given Value. - void addBound(BoundType type, Value val, int64_t value); + void addBound(presburger::BoundType type, Value val, int64_t value); using FlatLinearConstraints::addBound; /// Returns the Value associated with the pos^th variable. Asserts if diff --git a/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h b/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h index 8b0c2a561cfb8..9646894736de0 100644 --- a/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h +++ b/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h @@ -31,6 +31,9 @@ class PresburgerSet; class PresburgerRelation; struct SymbolicLexMin; +/// The type of bound: equal, lower bound or upper bound. +enum class BoundType { EQ, LB, UB }; + /// An IntegerRelation represents the set of points from a PresburgerSpace that /// satisfy a list of affine constraints. Affine constraints can be inequalities /// or equalities in the form: @@ -397,9 +400,6 @@ class IntegerRelation { /// to None. DivisionRepr getLocalReprs(std::vector *repr = nullptr) const; - /// The type of bound: equal, lower bound or upper bound. - enum BoundType { EQ, LB, UB }; - /// Adds a constant bound for the specified variable. void addBound(BoundType type, unsigned pos, const MPInt &value); void addBound(BoundType type, unsigned pos, int64_t value) { diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/AffineStructures.h b/mlir/include/mlir/Dialect/Affine/Analysis/AffineStructures.h index 6249428fb8e15..e59836444cc19 100644 --- a/mlir/include/mlir/Dialect/Affine/Analysis/AffineStructures.h +++ b/mlir/include/mlir/Dialect/Affine/Analysis/AffineStructures.h @@ -100,8 +100,8 @@ class FlatAffineValueConstraints : public FlatLinearValueConstraints { /// EQ bound, the bound map is expected to have exactly one result. In case /// of a LB/UB, the bound map may have more than one result, for each of which /// an inequality is added. - LogicalResult addBound(BoundType type, unsigned pos, AffineMap boundMap, - ValueRange operands); + LogicalResult addBound(presburger::BoundType type, unsigned pos, + AffineMap boundMap, ValueRange operands); using FlatLinearValueConstraints::addBound; /// Add the specified values as a dim or symbol var depending on its nature, diff --git a/mlir/lib/Analysis/FlatLinearValueConstraints.cpp b/mlir/lib/Analysis/FlatLinearValueConstraints.cpp index b89b2d11003af..24c8d871ff97c 100644 --- a/mlir/lib/Analysis/FlatLinearValueConstraints.cpp +++ b/mlir/lib/Analysis/FlatLinearValueConstraints.cpp @@ -308,8 +308,7 @@ static bool detectAsMod(const FlatLinearConstraints &cst, unsigned pos, // Express `var_r` as `var_n % divisor` and store the expression in `memo`. if (quotientCount >= 1) { - auto ub = cst.getConstantBound64(FlatLinearConstraints::BoundType::UB, - dimExpr.getPosition()); + auto ub = cst.getConstantBound64(BoundType::UB, dimExpr.getPosition()); // If `var_n` has an upperbound that is less than the divisor, mod can be // eliminated altogether. 
if (ub && *ub < divisor) diff --git a/mlir/lib/Analysis/Presburger/PWMAFunction.cpp b/mlir/lib/Analysis/Presburger/PWMAFunction.cpp index 64b9ba6bf7a0e..ce9e810069c48 100644 --- a/mlir/lib/Analysis/Presburger/PWMAFunction.cpp +++ b/mlir/lib/Analysis/Presburger/PWMAFunction.cpp @@ -231,14 +231,14 @@ MultiAffineFunction::getLexSet(OrderingKind comp, // outA - outB <= -1 // outA <= outB - 1 // outA < outB - levelSet.addBound(IntegerPolyhedron::BoundType::UB, subExpr, MPInt(-1)); + levelSet.addBound(BoundType::UB, subExpr, MPInt(-1)); break; case OrderingKind::GT: // For greater than, we add a lower bound of 1: // outA - outB >= 1 // outA > outB + 1 // outA > outB - levelSet.addBound(IntegerPolyhedron::BoundType::LB, subExpr, MPInt(1)); + levelSet.addBound(BoundType::LB, subExpr, MPInt(1)); break; case OrderingKind::GE: case OrderingKind::LE: diff --git a/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp index d7720a052e0dd..da8f0883d7d5d 100644 --- a/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp +++ b/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp @@ -445,12 +445,10 @@ static void computeDirectionVector( dependenceComponents->resize(numCommonLoops); for (unsigned j = 0; j < numCommonLoops; ++j) { (*dependenceComponents)[j].op = commonLoops[j].getOperation(); - auto lbConst = - dependenceDomain->getConstantBound64(IntegerPolyhedron::LB, j); + auto lbConst = dependenceDomain->getConstantBound64(BoundType::LB, j); (*dependenceComponents)[j].lb = lbConst.value_or(std::numeric_limits::min()); - auto ubConst = - dependenceDomain->getConstantBound64(IntegerPolyhedron::UB, j); + auto ubConst = dependenceDomain->getConstantBound64(BoundType::UB, j); (*dependenceComponents)[j].ub = ubConst.value_or(std::numeric_limits::max()); } diff --git a/mlir/lib/Dialect/Affine/Analysis/Utils.cpp b/mlir/lib/Dialect/Affine/Analysis/Utils.cpp index db4fa354d4c2d..41a739d726ed5 100644 --- a/mlir/lib/Dialect/Affine/Analysis/Utils.cpp +++ b/mlir/lib/Dialect/Affine/Analysis/Utils.cpp @@ -98,7 +98,7 @@ ComputationSliceState::getAsConstraints(FlatAffineValueConstraints *cst) { if (isValidSymbol(value)) { // Check if the symbol is a constant. if (auto cOp = value.getDefiningOp()) - cst->addBound(FlatAffineValueConstraints::EQ, value, cOp.value()); + cst->addBound(BoundType::EQ, value, cOp.value()); } else if (auto loop = getForInductionVarOwner(value)) { if (failed(cst->addAffineForOpDomain(loop))) return failure(); @@ -357,11 +357,11 @@ std::optional MemRefRegion::getConstantBoundingSizeAndShape( // that will need non-trivials means to eliminate. FlatAffineValueConstraints cstWithShapeBounds(cst); for (unsigned r = 0; r < rank; r++) { - cstWithShapeBounds.addBound(FlatAffineValueConstraints::LB, r, 0); + cstWithShapeBounds.addBound(BoundType::LB, r, 0); int64_t dimSize = memRefType.getDimSize(r); if (ShapedType::isDynamic(dimSize)) continue; - cstWithShapeBounds.addBound(FlatAffineValueConstraints::UB, r, dimSize - 1); + cstWithShapeBounds.addBound(BoundType::UB, r, dimSize - 1); } // Find a constant upper bound on the extent of this memref region along each @@ -516,7 +516,7 @@ LogicalResult MemRefRegion::compute(Operation *op, unsigned loopDepth, // Check if the symbol is a constant. 
Value symbol = operand; if (auto constVal = getConstantIntValue(symbol)) - cst.addBound(FlatAffineValueConstraints::EQ, symbol, constVal.value()); + cst.addBound(BoundType::EQ, symbol, constVal.value()); } else { LLVM_DEBUG(llvm::dbgs() << "unknown affine dimensional value"); return failure(); @@ -580,11 +580,10 @@ LogicalResult MemRefRegion::compute(Operation *op, unsigned loopDepth, if (addMemRefDimBounds) { auto memRefType = memref.getType().cast(); for (unsigned r = 0; r < rank; r++) { - cst.addBound(FlatAffineValueConstraints::LB, /*pos=*/r, /*value=*/0); + cst.addBound(BoundType::LB, /*pos=*/r, /*value=*/0); if (memRefType.isDynamicDim(r)) continue; - cst.addBound(FlatAffineValueConstraints::UB, /*pos=*/r, - memRefType.getDimSize(r) - 1); + cst.addBound(BoundType::UB, /*pos=*/r, memRefType.getDimSize(r) - 1); } } cst.removeTrivialRedundancy(); @@ -695,7 +694,7 @@ LogicalResult mlir::boundCheckLoadOrStoreOp(LoadOrStoreOp loadOrStoreOp, continue; // Check for overflow: d_i >= memref dim size. - ucst.addBound(FlatAffineValueConstraints::LB, r, dimSize); + ucst.addBound(BoundType::LB, r, dimSize); outOfBounds = !ucst.isEmpty(); if (outOfBounds && emitError) { loadOrStoreOp.emitOpError() @@ -706,7 +705,7 @@ LogicalResult mlir::boundCheckLoadOrStoreOp(LoadOrStoreOp loadOrStoreOp, FlatAffineValueConstraints lcst(*region.getConstraints()); std::fill(ineq.begin(), ineq.end(), 0); // d_i <= -1; - lcst.addBound(FlatAffineValueConstraints::UB, r, -1); + lcst.addBound(BoundType::UB, r, -1); outOfBounds = !lcst.isEmpty(); if (outOfBounds && emitError) { loadOrStoreOp.emitOpError() @@ -1403,9 +1402,8 @@ static void unpackOptionalValues(ArrayRef> source, /// Note: This function adds a new symbol column to the `constraints` for each /// dimension/symbol that exists in the affine map but not in `constraints`. static LogicalResult alignAndAddBound(FlatAffineValueConstraints &constraints, - IntegerPolyhedron::BoundType type, - unsigned pos, AffineMap map, - ValueRange operands) { + BoundType type, unsigned pos, + AffineMap map, ValueRange operands) { SmallVector dims, syms, newSyms; unpackOptionalValues(constraints.getMaybeValues(VarKind::SetDim), dims); unpackOptionalValues(constraints.getMaybeValues(VarKind::Symbol), syms); @@ -1482,7 +1480,7 @@ mlir::simplifyConstrainedMinMaxOp(Operation *op, // Add an inequality for each result expr_i of map: // isMin: op <= expr_i, !isMin: op >= expr_i - auto boundType = isMin ? IntegerPolyhedron::UB : IntegerPolyhedron::LB; + auto boundType = isMin ? BoundType::UB : BoundType::LB; // Upper bounds are exclusive, so add 1. (`affine.min` ops are inclusive.) AffineMap mapLbUb = isMin ? addConstToResults(map, 1) : map; if (failed( @@ -1504,8 +1502,7 @@ mlir::simplifyConstrainedMinMaxOp(Operation *op, // Add an equality: Set dimOpBound to computed bound. // Add back dimension for op. (Was removed by `getSliceBounds`.) AffineMap alignedBoundMap = boundMap.shiftDims(/*shift=*/1, /*offset=*/dimOp); - if (failed(constraints.addBound(IntegerPolyhedron::EQ, dimOpBound, - alignedBoundMap))) + if (failed(constraints.addBound(BoundType::EQ, dimOpBound, alignedBoundMap))) return failure(); // If the constraint system is empty, there is an inconsistency. (E.g., this @@ -1530,7 +1527,7 @@ mlir::simplifyConstrainedMinMaxOp(Operation *op, // Note: These equalities could have been added earlier and used to express // minOp <= expr_i. However, then we run the risk that `getSliceBounds` // computes minOpUb in terms of r_i dims, which is not desired. 
- if (failed(alignAndAddBound(newConstr, IntegerPolyhedron::EQ, i, + if (failed(alignAndAddBound(newConstr, BoundType::EQ, i, map.getSubMap({i - resultDimStart}), operands))) return failure(); @@ -1557,7 +1554,7 @@ mlir::simplifyConstrainedMinMaxOp(Operation *op, // Skip unused operands and operands that are already constants. if (!newOperands[i] || getConstantIntValue(newOperands[i])) continue; - if (auto bound = constraints.getConstantBound64(IntegerPolyhedron::EQ, i)) { + if (auto bound = constraints.getConstantBound64(BoundType::EQ, i)) { AffineExpr expr = i < newMap.getNumDims() ? builder.getAffineDimExpr(i) diff --git a/mlir/lib/Dialect/Affine/TransformOps/AffineTransformOps.cpp b/mlir/lib/Dialect/Affine/TransformOps/AffineTransformOps.cpp index 99dfaa9dee1d2..999adfad2ab5b 100644 --- a/mlir/lib/Dialect/Affine/TransformOps/AffineTransformOps.cpp +++ b/mlir/lib/Dialect/Affine/TransformOps/AffineTransformOps.cpp @@ -97,11 +97,9 @@ SimplifyBoundedAffineOpsOp::apply(TransformResults &results, unsigned pos; if (!cstr.findVar(std::get<0>(it), &pos)) pos = cstr.appendSymbolVar(std::get<0>(it)); - cstr.addBound(FlatAffineValueConstraints::BoundType::LB, pos, - std::get<1>(it)); + cstr.addBound(presburger::BoundType::LB, pos, std::get<1>(it)); // Note: addBound bounds are inclusive, but specified UB is exclusive. - cstr.addBound(FlatAffineValueConstraints::BoundType::UB, pos, - std::get<2>(it) - 1); + cstr.addBound(presburger::BoundType::UB, pos, std::get<2>(it) - 1); } // Transform all targets. diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp index 1e567a6db4108..a7f96dc0e08e2 100644 --- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp @@ -2371,8 +2371,8 @@ static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs, for (unsigned d = 0; d < rank; d++) { auto dimSize = memRefType.getDimSize(d); assert(dimSize > 0 && "filtered dynamic shapes above"); - regionCst->addBound(IntegerPolyhedron::LB, d, 0); - regionCst->addBound(IntegerPolyhedron::UB, d, dimSize - 1); + regionCst->addBound(BoundType::LB, d, 0); + regionCst->addBound(BoundType::UB, d, dimSize - 1); } return true; } diff --git a/mlir/lib/Dialect/Affine/Utils/Utils.cpp b/mlir/lib/Dialect/Affine/Utils/Utils.cpp index 50405953e05bd..d96b688d29ed5 100644 --- a/mlir/lib/Dialect/Affine/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/Utils.cpp @@ -1800,8 +1800,8 @@ MemRefType mlir::normalizeMemRefType(MemRefType memrefType, for (unsigned d = 0; d < rank; ++d) { // Use constraint system only in static dimensions. if (shape[d] > 0) { - fac.addBound(IntegerPolyhedron::LB, d, 0); - fac.addBound(IntegerPolyhedron::UB, d, shape[d] - 1); + fac.addBound(BoundType::LB, d, 0); + fac.addBound(BoundType::UB, d, shape[d] - 1); } else { memrefTypeDynDims.emplace_back(d); } @@ -1824,8 +1824,7 @@ MemRefType mlir::normalizeMemRefType(MemRefType memrefType, newShape[d] = ShapedType::kDynamic; } else { // The lower bound for the shape is always zero. - std::optional ubConst = - fac.getConstantBound64(IntegerPolyhedron::UB, d); + std::optional ubConst = fac.getConstantBound64(BoundType::UB, d); // For a static memref and an affine map with no symbols, this is // always bounded. However, when we have symbols, we may not be able to // obtain a constant upper bound. 
Also, mapping to a negative space is diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp index f3879f5dd9d12..75f818b1b275d 100644 --- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp @@ -270,7 +270,7 @@ void getUpperBoundForIndex(Value value, AffineMap &boundMap, if (auto applyOp = dyn_cast(op)) { AffineMap map = constraints.computeAlignedMap(applyOp.getAffineMap(), applyOp.getOperands()); - if (failed(constraints.addBound(IntegerPolyhedron::EQ, + if (failed(constraints.addBound(BoundType::EQ, getPosition(applyOp.getResult()), map))) return; continue; @@ -279,7 +279,7 @@ void getUpperBoundForIndex(Value value, AffineMap &boundMap, auto minOp = cast(op); AffineMap map = constraints.computeAlignedMap(minOp.getAffineMap(), minOp.getOperands()); - if (failed(constraints.addBound(IntegerPolyhedron::UB, + if (failed(constraints.addBound(BoundType::UB, getPosition(minOp.getResult()), map, /*isClosedBound=*/true))) return; @@ -290,8 +290,7 @@ void getUpperBoundForIndex(Value value, AffineMap &boundMap, // of the terminals of the index computation. unsigned pos = getPosition(value); if (constantRequired) { - auto ubConst = constraints.getConstantBound64( - FlatAffineValueConstraints::BoundType::UB, pos); + auto ubConst = constraints.getConstantBound64(BoundType::UB, pos); if (!ubConst) return; diff --git a/mlir/lib/Dialect/SCF/Utils/AffineCanonicalizationUtils.cpp b/mlir/lib/Dialect/SCF/Utils/AffineCanonicalizationUtils.cpp index 6964747cdebb6..1c458eee44d1a 100644 --- a/mlir/lib/Dialect/SCF/Utils/AffineCanonicalizationUtils.cpp +++ b/mlir/lib/Dialect/SCF/Utils/AffineCanonicalizationUtils.cpp @@ -98,9 +98,9 @@ LogicalResult scf::addLoopRangeConstraints(FlatAffineValueConstraints &cstr, std::optional lbInt = getConstantIntValue(lb); std::optional ubInt = getConstantIntValue(ub); if (lbInt) - cstr.addBound(IntegerPolyhedron::EQ, symLb, *lbInt); + cstr.addBound(BoundType::EQ, symLb, *lbInt); if (ubInt) - cstr.addBound(IntegerPolyhedron::EQ, symUb, *ubInt); + cstr.addBound(BoundType::EQ, symUb, *ubInt); // Lower bound: iv >= lb (equiv.: iv - lb >= 0) SmallVector ineqLb(cstr.getNumCols(), 0); @@ -131,7 +131,7 @@ LogicalResult scf::addLoopRangeConstraints(FlatAffineValueConstraints &cstr, /*dimCount=*/cstr.getNumDimVars(), /*symbolCount=*/cstr.getNumSymbolVars(), /*result=*/ivUb); - return cstr.addBound(IntegerPolyhedron::UB, dimIv, map); + return cstr.addBound(BoundType::UB, dimIv, map); } /// Canonicalize min/max operations in the context of for loops with a known @@ -202,9 +202,9 @@ LogicalResult scf::rewritePeeledMinMaxOp(RewriterBase &rewriter, Operation *op, constraints.appendDimVar({iv}); constraints.appendSymbolVar({ub, step}); if (auto constUb = getConstantIntValue(ub)) - constraints.addBound(IntegerPolyhedron::EQ, 1, *constUb); + constraints.addBound(BoundType::EQ, 1, *constUb); if (auto constStep = getConstantIntValue(step)) - constraints.addBound(IntegerPolyhedron::EQ, 2, *constStep); + constraints.addBound(BoundType::EQ, 2, *constStep); // Add loop peeling invariant. This is the main piece of knowledge that // enables AffineMinOp simplification. 
diff --git a/mlir/unittests/Analysis/Presburger/IntegerPolyhedronTest.cpp b/mlir/unittests/Analysis/Presburger/IntegerPolyhedronTest.cpp index cc55b96d5b1a8..6beb9384c8bf2 100644 --- a/mlir/unittests/Analysis/Presburger/IntegerPolyhedronTest.cpp +++ b/mlir/unittests/Analysis/Presburger/IntegerPolyhedronTest.cpp @@ -594,12 +594,12 @@ TEST(IntegerPolyhedronTest, removeRedundantConstraintsTest) { TEST(IntegerPolyhedronTest, addConstantUpperBound) { IntegerPolyhedron poly(PresburgerSpace::getSetSpace(2)); - poly.addBound(IntegerPolyhedron::UB, 0, 1); + poly.addBound(BoundType::UB, 0, 1); EXPECT_EQ(poly.atIneq(0, 0), -1); EXPECT_EQ(poly.atIneq(0, 1), 0); EXPECT_EQ(poly.atIneq(0, 2), 1); - poly.addBound(IntegerPolyhedron::UB, {1, 2, 3}, 1); + poly.addBound(BoundType::UB, {1, 2, 3}, 1); EXPECT_EQ(poly.atIneq(1, 0), -1); EXPECT_EQ(poly.atIneq(1, 1), -2); EXPECT_EQ(poly.atIneq(1, 2), -2); @@ -607,12 +607,12 @@ TEST(IntegerPolyhedronTest, addConstantUpperBound) { TEST(IntegerPolyhedronTest, addConstantLowerBound) { IntegerPolyhedron poly(PresburgerSpace::getSetSpace(2)); - poly.addBound(IntegerPolyhedron::LB, 0, 1); + poly.addBound(BoundType::LB, 0, 1); EXPECT_EQ(poly.atIneq(0, 0), 1); EXPECT_EQ(poly.atIneq(0, 1), 0); EXPECT_EQ(poly.atIneq(0, 2), -1); - poly.addBound(IntegerPolyhedron::LB, {1, 2, 3}, 1); + poly.addBound(BoundType::LB, {1, 2, 3}, 1); EXPECT_EQ(poly.atIneq(1, 0), 1); EXPECT_EQ(poly.atIneq(1, 1), 2); EXPECT_EQ(poly.atIneq(1, 2), 2); From 0691bcb18024a28e82e8dd9a08ab0820b40c9a37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Mon, 20 Mar 2023 17:43:26 +0100 Subject: [PATCH 081/208] [clang][Interp][NFC] Add tests for __fp16 Differential Revision: https://reviews.llvm.org/D146436 --- clang/test/AST/Interp/floats.cpp | 91 ++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/clang/test/AST/Interp/floats.cpp b/clang/test/AST/Interp/floats.cpp index 7b9328c4d1182..b3c4dd4c19a84 100644 --- a/clang/test/AST/Interp/floats.cpp +++ b/clang/test/AST/Interp/floats.cpp @@ -78,3 +78,94 @@ namespace compound { } static_assert(f2() == __FLT_MAX__, ""); } + + +namespace FP16 { + constexpr int i = 2; + constexpr __fp16 f = 1.0f; + static_assert(f == 1.0f, ""); + + constexpr __fp16 f2 = 1u * f; + static_assert(f2 == 1.0f, ""); + + constexpr __fp16 f3 = 1.5; + constexpr int i3 = f3; + static_assert(i3 == 1, ""); + + constexpr bool b3 = f3; + static_assert(b3, ""); + + + static_assert(1.0f16 + 3u == 4, ""); + static_assert(4.0f16 / 1.0f16 == 4, ""); + static_assert(10.0f16 * false == 0, ""); + + constexpr __fp16 __fp16s[] = {1.0f16, 2.0f16, 3.0f16, 4.0f16}; + + constexpr __fp16 m = 5.0f16 / 0.0f16; // ref-error {{must be initialized by a constant expression}} \ + // ref-note {{division by zero}} \ + // expected-error {{must be initialized by a constant expression}} \ + // expected-note {{division by zero}} + + static_assert(~2.0f16 == 3, ""); // ref-error {{invalid argument type '_Float16' to unary expression}} \ + // expected-error {{invalid argument type '_Float16' to unary expression}} + + /// Initialized by a double. + constexpr __fp16 df = 0.0; + /// The other way around. 
+ constexpr double fd = 0.0f16; + + static_assert(0.0f == -0.0f, ""); + + const int k = 3 * (1.0f16 / 3.0f16); + static_assert(k == 1, ""); + + constexpr bool b = 1.0f16; + static_assert(b, ""); + + constexpr double db = true; + static_assert(db == 1.0f16, ""); + + constexpr __fp16 fa[] = {1.0f, 2.0, 1, false}; + constexpr double da[] = {1.0f, 2.0, 1, false}; + + constexpr __fp16 fm = __FLT16_MAX__; + constexpr int someInt = fm; + + constexpr float SomeFloat = __FLT_MAX__; + constexpr __fp16 halfFloat = SomeFloat; + + constexpr float fp16ptr() { + __fp16 f1 = 1.0f16; + __fp16 *f2 = &f1; + + *f2 = 3.0; + return f1; + } + static_assert(fp16ptr() == 3.0, ""); + + namespace compound { + constexpr float f1() { + __fp16 f = 0; + f += 3.0; + f -= 3.0f; + + f += 1; + f /= 1; + f /= 1.0; + f *= f; + + f *= 2.0; + return f; + } + static_assert(f1() == 2, ""); + + constexpr float f2() { + __fp16 f = __FLT16_MAX__; + f += 1.0; + return f; + } + static_assert(f2() == __FLT16_MAX__, ""); + } + +} From 49dcd08c3d963e79d0710faf0e4024eb9b84bc8b Mon Sep 17 00:00:00 2001 From: esmeyi Date: Thu, 23 Mar 2023 05:09:47 -0400 Subject: [PATCH 082/208] [XCOFF] support the ref directive for object generation. Summary: A R_REF relocation as a non-relocating reference is required to prevent garbage collection (by the binder) of the ref symbol in object generation. Reviewed By: shchenz Differential Revision: https://reviews.llvm.org/D144356 --- llvm/include/llvm/MC/MCStreamer.h | 2 +- llvm/include/llvm/MC/MCXCOFFStreamer.h | 5 +- llvm/lib/MC/MCAsmStreamer.cpp | 7 +-- llvm/lib/MC/MCStreamer.cpp | 2 +- llvm/lib/MC/MCXCOFFStreamer.cpp | 15 ++++++ llvm/lib/MC/XCOFFObjectWriter.cpp | 5 +- .../PowerPC/MCTargetDesc/PPCAsmBackend.cpp | 9 ++++ .../PowerPC/MCTargetDesc/PPCFixupKinds.h | 3 +- .../MCTargetDesc/PPCXCOFFObjectWriter.cpp | 6 +++ llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 18 +++++--- .../test/CodeGen/PowerPC/pgo-ref-directive.ll | 46 +++++++++++++++++-- 11 files changed, 98 insertions(+), 20 deletions(-) diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h index aa39954d62868..f5891b24ae4b4 100644 --- a/llvm/include/llvm/MC/MCStreamer.h +++ b/llvm/include/llvm/MC/MCStreamer.h @@ -645,7 +645,7 @@ class MCStreamer { /// relocation table for one or more symbols. /// /// \param Sym - The symbol on the .ref directive. - virtual void emitXCOFFRefDirective(StringRef Sym); + virtual void emitXCOFFRefDirective(const MCSymbol *Symbol); /// Emit an ELF .size directive. 
/// diff --git a/llvm/include/llvm/MC/MCXCOFFStreamer.h b/llvm/include/llvm/MC/MCXCOFFStreamer.h index a437faeccbff4..aea2a3265d572 100644 --- a/llvm/include/llvm/MC/MCXCOFFStreamer.h +++ b/llvm/include/llvm/MC/MCXCOFFStreamer.h @@ -31,10 +31,7 @@ class MCXCOFFStreamer : public MCObjectStreamer { void emitXCOFFSymbolLinkageWithVisibility(MCSymbol *Symbol, MCSymbolAttr Linkage, MCSymbolAttr Visibility) override; - void emitXCOFFRefDirective(StringRef Name) override { - report_fatal_error("emitXCOFFRefDirective is not implemented yet on object" - "generation path"); - } + void emitXCOFFRefDirective(const MCSymbol *Symbol) override; void emitXCOFFRenameDirective(const MCSymbol *Name, StringRef Rename) override { report_fatal_error("emitXCOFFRenameDirective is not implemented yet on " diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp index 006f697b61875..fa1ab2717af2e 100644 --- a/llvm/lib/MC/MCAsmStreamer.cpp +++ b/llvm/lib/MC/MCAsmStreamer.cpp @@ -194,7 +194,7 @@ class MCAsmStreamer final : public MCStreamer { void emitXCOFFRenameDirective(const MCSymbol *Name, StringRef Rename) override; - void emitXCOFFRefDirective(StringRef Name) override; + void emitXCOFFRefDirective(const MCSymbol *Symbol) override; void emitXCOFFExceptDirective(const MCSymbol *Symbol, const MCSymbol *Trap, @@ -943,8 +943,9 @@ void MCAsmStreamer::emitXCOFFRenameDirective(const MCSymbol *Name, EmitEOL(); } -void MCAsmStreamer::emitXCOFFRefDirective(StringRef Name) { - OS << "\t.ref " << Name; +void MCAsmStreamer::emitXCOFFRefDirective(const MCSymbol *Symbol) { + OS << "\t.ref "; + Symbol->print(OS, MAI); EmitEOL(); } diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp index 517e258844156..4dd3163fd399d 100644 --- a/llvm/lib/MC/MCStreamer.cpp +++ b/llvm/lib/MC/MCStreamer.cpp @@ -1190,7 +1190,7 @@ void MCStreamer::emitXCOFFRenameDirective(const MCSymbol *Name, "XCOFF targets"); } -void MCStreamer::emitXCOFFRefDirective(StringRef Name) { +void MCStreamer::emitXCOFFRefDirective(const MCSymbol *Symbol) { llvm_unreachable("emitXCOFFRefDirective is only supported on XCOFF targets"); } diff --git a/llvm/lib/MC/MCXCOFFStreamer.cpp b/llvm/lib/MC/MCXCOFFStreamer.cpp index 25a678c68416e..d8ac07bc85b1b 100644 --- a/llvm/lib/MC/MCXCOFFStreamer.cpp +++ b/llvm/lib/MC/MCXCOFFStreamer.cpp @@ -81,6 +81,21 @@ void MCXCOFFStreamer::emitXCOFFSymbolLinkageWithVisibility( emitSymbolAttribute(Symbol, Visibility); } +void MCXCOFFStreamer::emitXCOFFRefDirective(const MCSymbol *Symbol) { + // Add a Fixup here to later record a relocation of type R_REF to prevent the + // ref symbol from being garbage collected (by the binder). + MCDataFragment *DF = getOrCreateDataFragment(); + const MCSymbolRefExpr *SRE = MCSymbolRefExpr::create(Symbol, getContext()); + std::optional MaybeKind = + getAssembler().getBackend().getFixupKind("R_REF"); + if (!MaybeKind) + report_fatal_error("failed to get fixup kind for R_REF relocation"); + + MCFixupKind Kind = *MaybeKind; + MCFixup Fixup = MCFixup::create(DF->getContents().size(), SRE, Kind); + DF->getFixups().push_back(Fixup); +} + void MCXCOFFStreamer::emitXCOFFExceptDirective(const MCSymbol *Symbol, const MCSymbol *Trap, unsigned Lang, unsigned Reason, diff --git a/llvm/lib/MC/XCOFFObjectWriter.cpp b/llvm/lib/MC/XCOFFObjectWriter.cpp index c79bdeb2cac4c..6452050d5941e 100644 --- a/llvm/lib/MC/XCOFFObjectWriter.cpp +++ b/llvm/lib/MC/XCOFFObjectWriter.cpp @@ -663,7 +663,10 @@ void XCOFFObjectWriter::recordRelocation(MCAssembler &Asm, // instr address plus any constant value. 
FixedValue = SectionMap[SymASec]->Address - BRInstrAddress + Target.getConstant(); - } + } else if (Type == XCOFF::RelocationType::R_REF) + // The FixedValue should always be 0 since it specifies a nonrelocating + // reference. + FixedValue = 0; assert((Fixup.getOffset() <= MaxRawDataSize - Layout.getFragmentOffset(Fragment)) && diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index 227bd59ba3a64..a814bb1b4c07e 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -238,6 +238,8 @@ class XCOFFPPCAsmBackend : public PPCAsmBackend { createObjectTargetWriter() const override { return createPPCXCOFFObjectWriter(TT.isArch64Bit()); } + + std::optional getFixupKind(StringRef Name) const override; }; } // end anonymous namespace @@ -272,6 +274,13 @@ ELFPPCAsmBackend::getFixupKind(StringRef Name) const { return std::nullopt; } +std::optional +XCOFFPPCAsmBackend::getFixupKind(StringRef Name) const { + return StringSwitch>(Name) + .Case("R_REF", (MCFixupKind)PPC::fixup_ppc_nofixup) + .Default(std::nullopt); +} + MCAsmBackend *llvm::createPPCAsmBackend(const Target &T, const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h index df0c666f5b113..9e8ee9f23107b 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h @@ -48,7 +48,8 @@ enum Fixups { /// Not a true fixup, but ties a symbol to a call to __tls_get_addr for the /// TLS general and local dynamic models, or inserts the thread-pointer - /// register number. + /// register number. It can also be used to tie the ref symbol to prevent it + /// from being garbage collected on AIX. 
fixup_ppc_nofixup, /// A 16-bit fixup corresponding to lo16(_foo) with implied 3 zero bits for diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp index 729cb35cbebcf..b6e749b781804 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp @@ -90,6 +90,12 @@ std::pair PPCXCOFFObjectWriter::getRelocTypeAndSignSize( return {XCOFF::RelocationType::R_RBR, EncodedSignednessIndicator | 25}; case PPC::fixup_ppc_br24abs: return {XCOFF::RelocationType::R_RBA, EncodedSignednessIndicator | 25}; + case PPC::fixup_ppc_nofixup: { + if (Modifier == MCSymbolRefExpr::VK_None) + return {XCOFF::RelocationType::R_REF, 0}; + else + llvm_unreachable("Unsupported Modifier"); + } break; case FK_Data_4: case FK_Data_8: const uint8_t SignAndSizeForFKData = diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 1ecaeabacf9f7..7c6fd3b85b1eb 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -2517,16 +2517,22 @@ void PPCAIXAsmPrinter::emitPGORefs() { OutStreamer->switchSection(CntsSection); if (OutContext.hasXCOFFSection( "__llvm_prf_data", - XCOFF::CsectProperties(XCOFF::XMC_RW, XCOFF::XTY_SD))) - OutStreamer->emitXCOFFRefDirective("__llvm_prf_data[RW]"); + XCOFF::CsectProperties(XCOFF::XMC_RW, XCOFF::XTY_SD))) { + MCSymbol *S = OutContext.getOrCreateSymbol("__llvm_prf_data[RW]"); + OutStreamer->emitXCOFFRefDirective(S); + } if (OutContext.hasXCOFFSection( "__llvm_prf_names", - XCOFF::CsectProperties(XCOFF::XMC_RO, XCOFF::XTY_SD))) - OutStreamer->emitXCOFFRefDirective("__llvm_prf_names[RO]"); + XCOFF::CsectProperties(XCOFF::XMC_RO, XCOFF::XTY_SD))) { + MCSymbol *S = OutContext.getOrCreateSymbol("__llvm_prf_names[RO]"); + OutStreamer->emitXCOFFRefDirective(S); + } if (OutContext.hasXCOFFSection( "__llvm_prf_vnds", - XCOFF::CsectProperties(XCOFF::XMC_RW, XCOFF::XTY_SD))) - OutStreamer->emitXCOFFRefDirective("__llvm_prf_vnds[RW]"); + XCOFF::CsectProperties(XCOFF::XMC_RW, XCOFF::XTY_SD))) { + MCSymbol *S = OutContext.getOrCreateSymbol("__llvm_prf_vnds[RW]"); + OutStreamer->emitXCOFFRefDirective(S); + } } } diff --git a/llvm/test/CodeGen/PowerPC/pgo-ref-directive.ll b/llvm/test/CodeGen/PowerPC/pgo-ref-directive.ll index 172affa4a2661..201af2f949618 100644 --- a/llvm/test/CodeGen/PowerPC/pgo-ref-directive.ll +++ b/llvm/test/CodeGen/PowerPC/pgo-ref-directive.ll @@ -1,9 +1,22 @@ ; RUN: rm -rf %t && split-file %s %t -; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec -mtriple powerpc-ibm-aix-xcoff -xcoff-traceback-table=false < %t/no-ref.ll | FileCheck %s --check-prefixes=NOREF -; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec -mtriple powerpc-ibm-aix-xcoff -xcoff-traceback-table=false < %t/no-vnds.ll | FileCheck %s --check-prefixes=NOVNDS -; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec -mtriple powerpc-ibm-aix-xcoff -xcoff-traceback-table=false < %t/with-vnds.ll | FileCheck %s --check-prefixes=WITHVNDS +; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec -mtriple powerpc-ibm-aix-xcoff \ +; RUN: -xcoff-traceback-table=false < %t/no-ref.ll | FileCheck %s --check-prefixes=NOREF +; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec -mtriple powerpc-ibm-aix-xcoff \ +; RUN: -xcoff-traceback-table=false --filetype=obj < %t/no-ref.ll -o %t/no-ref.o +; RUN: llvm-objdump %t/no-ref.o -r | FileCheck %s 
--check-prefix=NOREF-OBJ +; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec -mtriple powerpc-ibm-aix-xcoff \ +; RUN: -xcoff-traceback-table=false < %t/no-vnds.ll | FileCheck %s --check-prefixes=NOVNDS +; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec -mtriple powerpc-ibm-aix-xcoff \ +; RUN: -xcoff-traceback-table=false --filetype=obj < %t/no-vnds.ll -o %t/no-vnds.o +; RUN: llvm-objdump %t/no-vnds.o -r | FileCheck %s --check-prefix=NOVNDS-OBJ + +; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec -mtriple powerpc-ibm-aix-xcoff \ +; RUN: -xcoff-traceback-table=false < %t/with-vnds.ll | FileCheck %s --check-prefixes=WITHVNDS +; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec -mtriple powerpc-ibm-aix-xcoff \ +; RUN: -xcoff-traceback-table=false --filetype=obj < %t/with-vnds.ll -o %t/with-vnds.o +; RUN: llvm-objdump %t/with-vnds.o -tr | FileCheck %s --check-prefix=WITHVNDS-OBJ ;--- no-ref.ll ; The absence of a __llvm_prf_cnts section should stop generating the .refs. @@ -27,6 +40,10 @@ entry: ; NOREF-NOT: .ref __llvm_prf_names ; NOREF-NOT: .ref __llvm_prf_vnds +; NOREF-OBJ-NOT: R_REF __llvm_prf_data +; NOREF-OBJ-NOT: R_REF __llvm_prf_names +; NOREF-OBJ-NOT: R_REF __llvm_prf_vnds + ;--- no-vnds.ll ; This is the most common case. When -fprofile-generate is used and there exists executable code, we generate the __llvm_prf_cnts, __llvm_prf_data, and __llvm_prf_names sections. ; @@ -56,6 +73,10 @@ entry: ; NOVNDS-NEXT: .ref __llvm_prf_names[RO] ; NOVNDS-NOT: .ref __llvm_prf_vnds +; NOVNDS-OBJ: 00000008 R_REF __llvm_prf_data +; NOVNDS-OBJ: 00000008 R_REF __llvm_prf_names +; NOVNDS-OBJ-NOT: R_REF __llvm_prf_vnds + ;--- with-vnds.ll ; When value profiling is needed, the PGO instrumentation generates variables in the __llvm_prf_vnds section, so we generate a .ref for them too. 
; @@ -80,3 +101,22 @@ entry: ; WITHVNDS-NEXT: .ref __llvm_prf_data[RW] ; WITHVNDS-NEXT: .ref __llvm_prf_names[RO] ; WITHVNDS-NEXT: .ref __llvm_prf_vnds[RW] + +; WITHVNDS-OBJ: SYMBOL TABLE: +; WITHVNDS-OBJ-NEXT: 00000000 df *DEBUG* 00000000 +; WITHVNDS-OBJ-NEXT: 00000000 l .text 00000008 .text +; WITHVNDS-OBJ-NEXT: 00000000 g F .text (csect: .text) 00000000 .main +; WITHVNDS-OBJ-NEXT: 00000008 l .text 00000006 __llvm_prf_names +; WITHVNDS-OBJ-NEXT: 00000010 l O .data 00000008 __llvm_prf_cnts +; WITHVNDS-OBJ-NEXT: 00000018 l O .data 00000008 __llvm_prf_data +; WITHVNDS-OBJ-NEXT: 00000020 l O .data 000000f0 __llvm_prf_vnds +; WITHVNDS-OBJ-NEXT: 00000110 g O .data 0000000c main +; WITHVNDS-OBJ-NEXT: 0000011c l .data 00000000 TOC + +; WITHVNDS-OBJ: RELOCATION RECORDS FOR [.data]: +; WITHVNDS-OBJ-NEXT: OFFSET TYPE VALUE +; WITHVNDS-OBJ-NEXT: 00000008 R_REF __llvm_prf_data +; WITHVNDS-OBJ-NEXT: 00000008 R_REF __llvm_prf_names +; WITHVNDS-OBJ-NEXT: 00000008 R_REF __llvm_prf_vnds +; WITHVNDS-OBJ-NEXT: 00000100 R_POS .main +; WITHVNDS-OBJ-NEXT: 00000104 R_POS TOC From 8c10256734cd47274671fcabe94f24f15ecd6209 Mon Sep 17 00:00:00 2001 From: MarcoFalke <*~=`'#}+{/-|&$^_@721217.xyz> Date: Tue, 14 Mar 2023 14:23:07 +0100 Subject: [PATCH 083/208] clang-tidy: Detect use-after-move in CXXCtorInitializer Fixes https://github.com/llvm/llvm-project/issues/51844 Differential Revision: https://reviews.llvm.org/D146288 --- .../clang-tidy/bugprone/UseAfterMoveCheck.cpp | 82 ++++++++---- clang-tools-extra/docs/ReleaseNotes.rst | 4 + .../checkers/bugprone/use-after-move.cpp | 126 ++++++++++++++++++ 3 files changed, 184 insertions(+), 28 deletions(-) diff --git a/clang-tools-extra/clang-tidy/bugprone/UseAfterMoveCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UseAfterMoveCheck.cpp index b7eadb87b4fcd..c10c3652a153a 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UseAfterMoveCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UseAfterMoveCheck.cpp @@ -58,11 +58,11 @@ class UseAfterMoveFinder { public: UseAfterMoveFinder(ASTContext *TheContext); - // Within the given function body, finds the first use of 'MovedVariable' that + // Within the given code block, finds the first use of 'MovedVariable' that // occurs after 'MovingCall' (the expression that performs the move). If a // use-after-move is found, writes information about it to 'TheUseAfterMove'. // Returns whether a use-after-move was found. 
- bool find(Stmt *FunctionBody, const Expr *MovingCall, + bool find(Stmt *CodeBlock, const Expr *MovingCall, const ValueDecl *MovedVariable, UseAfterMove *TheUseAfterMove); private: @@ -104,7 +104,7 @@ static StatementMatcher inDecltypeOrTemplateArg() { UseAfterMoveFinder::UseAfterMoveFinder(ASTContext *TheContext) : Context(TheContext) {} -bool UseAfterMoveFinder::find(Stmt *FunctionBody, const Expr *MovingCall, +bool UseAfterMoveFinder::find(Stmt *CodeBlock, const Expr *MovingCall, const ValueDecl *MovedVariable, UseAfterMove *TheUseAfterMove) { // Generate the CFG manually instead of through an AnalysisDeclContext because @@ -118,12 +118,11 @@ bool UseAfterMoveFinder::find(Stmt *FunctionBody, const Expr *MovingCall, Options.AddImplicitDtors = true; Options.AddTemporaryDtors = true; std::unique_ptr TheCFG = - CFG::buildCFG(nullptr, FunctionBody, Context, Options); + CFG::buildCFG(nullptr, CodeBlock, Context, Options); if (!TheCFG) return false; - Sequence = - std::make_unique(TheCFG.get(), FunctionBody, Context); + Sequence = std::make_unique(TheCFG.get(), CodeBlock, Context); BlockMap = std::make_unique(TheCFG.get(), Context); Visited.clear(); @@ -398,20 +397,28 @@ static void emitDiagnostic(const Expr *MovingCall, const DeclRefExpr *MoveArg, } void UseAfterMoveCheck::registerMatchers(MatchFinder *Finder) { + // try_emplace is a common maybe-moving function that returns a + // bool to tell callers whether it moved. Ignore std::move inside + // try_emplace to avoid false positives as we don't track uses of + // the bool. + auto TryEmplaceMatcher = + cxxMemberCallExpr(callee(cxxMethodDecl(hasName("try_emplace")))); auto CallMoveMatcher = - callExpr(callee(functionDecl(hasName("::std::move"))), argumentCountIs(1), + callExpr(argumentCountIs(1), callee(functionDecl(hasName("::std::move"))), hasArgument(0, declRefExpr().bind("arg")), + unless(inDecltypeOrTemplateArg()), + unless(hasParent(TryEmplaceMatcher)), expr().bind("call-move"), anyOf(hasAncestor(compoundStmt( hasParent(lambdaExpr().bind("containing-lambda")))), - hasAncestor(functionDecl().bind("containing-func"))), - unless(inDecltypeOrTemplateArg()), - // try_emplace is a common maybe-moving function that returns a - // bool to tell callers whether it moved. Ignore std::move inside - // try_emplace to avoid false positives as we don't track uses of - // the bool. 
- unless(hasParent(cxxMemberCallExpr( - callee(cxxMethodDecl(hasName("try_emplace"))))))) - .bind("call-move"); + hasAncestor(functionDecl(anyOf( + cxxConstructorDecl( + hasAnyConstructorInitializer(withInitializer( + expr(anyOf(equalsBoundNode("call-move"), + hasDescendant(expr( + equalsBoundNode("call-move"))))) + .bind("containing-ctor-init")))) + .bind("containing-ctor"), + functionDecl().bind("containing-func")))))); Finder->addMatcher( traverse( @@ -434,6 +441,10 @@ void UseAfterMoveCheck::registerMatchers(MatchFinder *Finder) { } void UseAfterMoveCheck::check(const MatchFinder::MatchResult &Result) { + const auto *ContainingCtor = + Result.Nodes.getNodeAs("containing-ctor"); + const auto *ContainingCtorInit = + Result.Nodes.getNodeAs("containing-ctor-init"); const auto *ContainingLambda = Result.Nodes.getNodeAs("containing-lambda"); const auto *ContainingFunc = @@ -445,23 +456,38 @@ void UseAfterMoveCheck::check(const MatchFinder::MatchResult &Result) { if (!MovingCall || !MovingCall->getExprLoc().isValid()) MovingCall = CallMove; - Stmt *FunctionBody = nullptr; - if (ContainingLambda) - FunctionBody = ContainingLambda->getBody(); - else if (ContainingFunc) - FunctionBody = ContainingFunc->getBody(); - else - return; - // Ignore the std::move if the variable that was passed to it isn't a local // variable. if (!Arg->getDecl()->getDeclContext()->isFunctionOrMethod()) return; - UseAfterMoveFinder Finder(Result.Context); - UseAfterMove Use; - if (Finder.find(FunctionBody, MovingCall, Arg->getDecl(), &Use)) - emitDiagnostic(MovingCall, Arg, Use, this, Result.Context); + // Collect all code blocks that could use the arg after move. + llvm::SmallVector CodeBlocks{}; + if (ContainingCtor) { + CodeBlocks.push_back(ContainingCtor->getBody()); + if (ContainingCtorInit) { + // Collect the constructor initializer expressions. + bool BeforeMove{true}; + for (CXXCtorInitializer *Init : ContainingCtor->inits()) { + if (BeforeMove && Init->getInit()->IgnoreImplicit() == + ContainingCtorInit->IgnoreImplicit()) + BeforeMove = false; + if (!BeforeMove) + CodeBlocks.push_back(Init->getInit()); + } + } + } else if (ContainingLambda) { + CodeBlocks.push_back(ContainingLambda->getBody()); + } else if (ContainingFunc) { + CodeBlocks.push_back(ContainingFunc->getBody()); + } + + for (Stmt *CodeBlock : CodeBlocks) { + UseAfterMoveFinder Finder(Result.Context); + UseAfterMove Use; + if (Finder.find(CodeBlock, MovingCall, Arg->getDecl(), &Use)) + emitDiagnostic(MovingCall, Arg, Use, this, Result.Context); + } } } // namespace clang::tidy::bugprone diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 80f5b46681713..89419141cebbd 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -162,6 +162,10 @@ Changes in existing checks ` check. Global options of the same name should be used instead. +- Improved :doc:`bugprone-use-after-move + ` check to also cover constructor + initializers. + - Deprecated check-local options `HeaderFileExtensions` in :doc:`google-build-namespaces ` check. 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/use-after-move.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/use-after-move.cpp index 45cef8abfd1f6..1e0831048dbd4 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/use-after-move.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/use-after-move.cpp @@ -369,6 +369,18 @@ void lambdas() { }; a.foo(); } + // Don't warn if 'a' is a copy inside a synchronous lambda + { + A a; + A copied{[a] mutable { return std::move(a); }()}; + a.foo(); + } + // False negative (should warn if 'a' is a ref inside a synchronous lambda) + { + A a; + A moved{[&a] mutable { return std::move(a); }()}; + a.foo(); + } // Warn if the use consists of a capture that happens after a move. { A a; @@ -1367,6 +1379,120 @@ void typeId() { } } // namespace UnevalContext +class CtorInit { +public: + CtorInit(std::string val) + : a{val.empty()}, // fine + s{std::move(val)}, + b{val.empty()} + // CHECK-NOTES: [[@LINE-1]]:11: warning: 'val' used after it was moved + // CHECK-NOTES: [[@LINE-3]]:9: note: move occurred here + {} + +private: + bool a; + std::string s; + bool b; +}; + +class CtorInitLambda { +public: + CtorInitLambda(std::string val) + : a{val.empty()}, // fine + s{std::move(val)}, + b{[&] { return val.empty(); }()}, + // CHECK-NOTES: [[@LINE-1]]:12: warning: 'val' used after it was moved + // CHECK-NOTES: [[@LINE-3]]:9: note: move occurred here + c{[] { + std::string str{}; + std::move(str); + return str.empty(); + // CHECK-NOTES: [[@LINE-1]]:18: warning: 'str' used after it was moved + // CHECK-NOTES: [[@LINE-3]]:11: note: move occurred here + }()} { + std::move(val); + // CHECK-NOTES: [[@LINE-1]]:15: warning: 'val' used after it was moved + // CHECK-NOTES: [[@LINE-13]]:9: note: move occurred here + std::string val2{}; + std::move(val2); + val2.empty(); + // CHECK-NOTES: [[@LINE-1]]:5: warning: 'val2' used after it was moved + // CHECK-NOTES: [[@LINE-3]]:5: note: move occurred here + } + +private: + bool a; + std::string s; + bool b; + bool c; + bool d{}; +}; + +class CtorInitOrder { +public: + CtorInitOrder(std::string val) + : a{val.empty()}, // fine + b{val.empty()}, + // CHECK-NOTES: [[@LINE-1]]:11: warning: 'val' used after it was moved + s{std::move(val)} {} // wrong order + // CHECK-NOTES: [[@LINE-1]]:9: note: move occurred here + // CHECK-NOTES: [[@LINE-4]]:11: note: the use happens in a later loop iteration than the move + +private: + bool a; + std::string s; + bool b; +}; + +struct Obj {}; +struct CtorD { + CtorD(Obj b); +}; + +struct CtorC { + CtorC(Obj b); +}; + +struct CtorB { + CtorB(Obj &b); +}; + +struct CtorA : CtorB, CtorC, CtorD { + CtorA(Obj b) : CtorB{b}, CtorC{std::move(b)}, CtorD{b} {} + // CHECK-NOTES: [[@LINE-1]]:55: warning: 'b' used after it was moved + // CHECK-NOTES: [[@LINE-2]]:34: note: move occurred here +}; + +struct Base { + Base(Obj b) : bb{std::move(b)} {} + template Base(Call &&c) : bb{c()} {}; + + Obj bb; +}; + +struct Derived : Base, CtorC { + Derived(Obj b) + : Base{[&] mutable { return std::move(b); }()}, + // False negative: The lambda/std::move was executed, so it should warn + // below + CtorC{b} {} +}; + +struct Derived2 : Base, CtorC { + Derived2(Obj b) + : Base{[&] mutable { return std::move(b); }}, + // This was a move, but it doesn't warn below, because it can't know if + // the lambda/std::move was actually called + CtorC{b} {} +}; + +struct Derived3 : Base, CtorC { + Derived3(Obj b) + : Base{[c = std::move(b)] mutable { return std::move(c); }}, CtorC{b} {} + // 
CHECK-NOTES: [[@LINE-1]]:74: warning: 'b' used after it was moved + // CHECK-NOTES: [[@LINE-2]]:19: note: move occurred here +}; + class PR38187 { public: PR38187(std::string val) : val_(std::move(val)) { From 814177e434d8daf70a3d67345c166d40457f68f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Thu, 23 Mar 2023 10:33:03 +0100 Subject: [PATCH 084/208] Revert "[clang][Interp][NFC] Add tests for __fp16" This reverts commit 0691bcb18024a28e82e8dd9a08ab0820b40c9a37. Looks like this breaks builders, e.g. https://lab.llvm.org/buildbot#builders/231/builds/9790 --- clang/test/AST/Interp/floats.cpp | 91 -------------------------------- 1 file changed, 91 deletions(-) diff --git a/clang/test/AST/Interp/floats.cpp b/clang/test/AST/Interp/floats.cpp index b3c4dd4c19a84..7b9328c4d1182 100644 --- a/clang/test/AST/Interp/floats.cpp +++ b/clang/test/AST/Interp/floats.cpp @@ -78,94 +78,3 @@ namespace compound { } static_assert(f2() == __FLT_MAX__, ""); } - - -namespace FP16 { - constexpr int i = 2; - constexpr __fp16 f = 1.0f; - static_assert(f == 1.0f, ""); - - constexpr __fp16 f2 = 1u * f; - static_assert(f2 == 1.0f, ""); - - constexpr __fp16 f3 = 1.5; - constexpr int i3 = f3; - static_assert(i3 == 1, ""); - - constexpr bool b3 = f3; - static_assert(b3, ""); - - - static_assert(1.0f16 + 3u == 4, ""); - static_assert(4.0f16 / 1.0f16 == 4, ""); - static_assert(10.0f16 * false == 0, ""); - - constexpr __fp16 __fp16s[] = {1.0f16, 2.0f16, 3.0f16, 4.0f16}; - - constexpr __fp16 m = 5.0f16 / 0.0f16; // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{division by zero}} \ - // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{division by zero}} - - static_assert(~2.0f16 == 3, ""); // ref-error {{invalid argument type '_Float16' to unary expression}} \ - // expected-error {{invalid argument type '_Float16' to unary expression}} - - /// Initialized by a double. - constexpr __fp16 df = 0.0; - /// The other way around. 
- constexpr double fd = 0.0f16; - - static_assert(0.0f == -0.0f, ""); - - const int k = 3 * (1.0f16 / 3.0f16); - static_assert(k == 1, ""); - - constexpr bool b = 1.0f16; - static_assert(b, ""); - - constexpr double db = true; - static_assert(db == 1.0f16, ""); - - constexpr __fp16 fa[] = {1.0f, 2.0, 1, false}; - constexpr double da[] = {1.0f, 2.0, 1, false}; - - constexpr __fp16 fm = __FLT16_MAX__; - constexpr int someInt = fm; - - constexpr float SomeFloat = __FLT_MAX__; - constexpr __fp16 halfFloat = SomeFloat; - - constexpr float fp16ptr() { - __fp16 f1 = 1.0f16; - __fp16 *f2 = &f1; - - *f2 = 3.0; - return f1; - } - static_assert(fp16ptr() == 3.0, ""); - - namespace compound { - constexpr float f1() { - __fp16 f = 0; - f += 3.0; - f -= 3.0f; - - f += 1; - f /= 1; - f /= 1.0; - f *= f; - - f *= 2.0; - return f; - } - static_assert(f1() == 2, ""); - - constexpr float f2() { - __fp16 f = __FLT16_MAX__; - f += 1.0; - return f; - } - static_assert(f2() == __FLT16_MAX__, ""); - } - -} From 25466efb532f2255c86ffa721a126e5e5c0edc18 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Thu, 23 Mar 2023 10:34:29 +0100 Subject: [PATCH 085/208] [gn] Port e655d8a54880 more --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index bab71dfafae19..8372efe72f7eb 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -554,7 +554,6 @@ if (current_toolchain == default_toolchain) { "__mutex/mutex.h", "__mutex/tag_types.h", "__mutex/unique_lock.h", - "__mutex_base", "__node_handle", "__numeric/accumulate.h", "__numeric/adjacent_difference.h", From 61b0a492ccb1ab62c8c8aa1b154ce4c7d69a6da2 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Wed, 22 Mar 2023 10:05:48 +0000 Subject: [PATCH 086/208] [llvm][ARM] Refactor isMnemonicVPTPredicable Fixes #61607 Several names were repeated in this giant list. I have refactored it and removed the duplicates. 
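The refactor replaces the long chain of `Mnemonic.startswith(...) || ...` checks with a single
prefix table queried via std::any_of. A minimal standalone sketch of that pattern (illustrative
only, not taken from the diff; the helper name and the shortened prefix list are hypothetical,
the full table appears in the hunk below):

    #include <algorithm>
    #include <iterator>
    #include "llvm/ADT/StringRef.h"

    // Returns true if Mnemonic starts with any of the listed prefixes.
    static bool startsWithAny(llvm::StringRef Mnemonic) {
      static const char *const Prefixes[] = {"vabav", "vabd", "vabs"};
      return std::any_of(std::begin(Prefixes), std::end(Prefixes),
                         [&Mnemonic](const char *P) {
                           return Mnemonic.startswith(P);
                         });
    }

Keeping the prefixes in one sorted array also makes duplicates easy to spot, which is how the
repeated entries were removed.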
Reviewed By: dmgreen Differential Revision: https://reviews.llvm.org/D146619 --- .../lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 104 +++++++----------- 1 file changed, 37 insertions(+), 67 deletions(-) diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index e0bc431fd4bd8..6cbb7120e2667 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -12887,71 +12887,41 @@ bool ARMAsmParser::isMnemonicVPTPredicable(StringRef Mnemonic, if (!hasMVE()) return false; - return Mnemonic.startswith("vabav") || Mnemonic.startswith("vaddv") || - Mnemonic.startswith("vaddlv") || Mnemonic.startswith("vminnmv") || - Mnemonic.startswith("vminnmav") || Mnemonic.startswith("vminv") || - Mnemonic.startswith("vminav") || Mnemonic.startswith("vmaxnmv") || - Mnemonic.startswith("vmaxnmav") || Mnemonic.startswith("vmaxv") || - Mnemonic.startswith("vmaxav") || Mnemonic.startswith("vmladav") || - Mnemonic.startswith("vrmlaldavh") || Mnemonic.startswith("vrmlalvh") || - Mnemonic.startswith("vmlsdav") || Mnemonic.startswith("vmlav") || - Mnemonic.startswith("vmlaldav") || Mnemonic.startswith("vmlalv") || - Mnemonic.startswith("vmaxnm") || Mnemonic.startswith("vminnm") || - Mnemonic.startswith("vmax") || Mnemonic.startswith("vmin") || - Mnemonic.startswith("vshlc") || Mnemonic.startswith("vmovlt") || - Mnemonic.startswith("vmovlb") || Mnemonic.startswith("vshll") || - Mnemonic.startswith("vrshrn") || Mnemonic.startswith("vshrn") || - Mnemonic.startswith("vqrshrun") || Mnemonic.startswith("vqshrun") || - Mnemonic.startswith("vqrshrn") || Mnemonic.startswith("vqshrn") || - Mnemonic.startswith("vbic") || Mnemonic.startswith("vrev64") || - Mnemonic.startswith("vrev32") || Mnemonic.startswith("vrev16") || - Mnemonic.startswith("vmvn") || Mnemonic.startswith("veor") || - Mnemonic.startswith("vorn") || Mnemonic.startswith("vorr") || - Mnemonic.startswith("vand") || Mnemonic.startswith("vmul") || - Mnemonic.startswith("vqrdmulh") || Mnemonic.startswith("vqdmulh") || - Mnemonic.startswith("vsub") || Mnemonic.startswith("vadd") || - Mnemonic.startswith("vqsub") || Mnemonic.startswith("vqadd") || - Mnemonic.startswith("vabd") || Mnemonic.startswith("vrhadd") || - Mnemonic.startswith("vhsub") || Mnemonic.startswith("vhadd") || - Mnemonic.startswith("vdup") || Mnemonic.startswith("vcls") || - Mnemonic.startswith("vclz") || Mnemonic.startswith("vneg") || - Mnemonic.startswith("vabs") || Mnemonic.startswith("vqneg") || - Mnemonic.startswith("vqabs") || - (Mnemonic.startswith("vrint") && Mnemonic != "vrintr") || - Mnemonic.startswith("vcmla") || Mnemonic.startswith("vfma") || - Mnemonic.startswith("vfms") || Mnemonic.startswith("vcadd") || - Mnemonic.startswith("vadd") || Mnemonic.startswith("vsub") || - Mnemonic.startswith("vshl") || Mnemonic.startswith("vqshl") || - Mnemonic.startswith("vqrshl") || Mnemonic.startswith("vrshl") || - Mnemonic.startswith("vsri") || Mnemonic.startswith("vsli") || - Mnemonic.startswith("vrshr") || Mnemonic.startswith("vshr") || - Mnemonic.startswith("vpsel") || Mnemonic.startswith("vcmp") || - Mnemonic.startswith("vqdmladh") || Mnemonic.startswith("vqrdmladh") || - Mnemonic.startswith("vqdmlsdh") || Mnemonic.startswith("vqrdmlsdh") || - Mnemonic.startswith("vcmul") || Mnemonic.startswith("vrmulh") || - Mnemonic.startswith("vqmovn") || Mnemonic.startswith("vqmovun") || - Mnemonic.startswith("vmovnt") || Mnemonic.startswith("vmovnb") || - Mnemonic.startswith("vmaxa") || 
Mnemonic.startswith("vmaxnma") || - Mnemonic.startswith("vhcadd") || Mnemonic.startswith("vadc") || - Mnemonic.startswith("vsbc") || Mnemonic.startswith("vrshr") || - Mnemonic.startswith("vshr") || Mnemonic.startswith("vstrb") || - Mnemonic.startswith("vldrb") || - (Mnemonic.startswith("vstrh") && Mnemonic != "vstrhi") || - (Mnemonic.startswith("vldrh") && Mnemonic != "vldrhi") || - Mnemonic.startswith("vstrw") || Mnemonic.startswith("vldrw") || - Mnemonic.startswith("vldrd") || Mnemonic.startswith("vstrd") || - Mnemonic.startswith("vqdmull") || Mnemonic.startswith("vbrsr") || - Mnemonic.startswith("vfmas") || Mnemonic.startswith("vmlas") || - Mnemonic.startswith("vmla") || Mnemonic.startswith("vqdmlash") || - Mnemonic.startswith("vqdmlah") || Mnemonic.startswith("vqrdmlash") || - Mnemonic.startswith("vqrdmlah") || Mnemonic.startswith("viwdup") || - Mnemonic.startswith("vdwdup") || Mnemonic.startswith("vidup") || - Mnemonic.startswith("vddup") || Mnemonic.startswith("vctp") || - Mnemonic.startswith("vpnot") || Mnemonic.startswith("vbic") || - Mnemonic.startswith("vrmlsldavh") || Mnemonic.startswith("vmlsldav") || - Mnemonic.startswith("vcvt") || - MS.isVPTPredicableCDEInstr(Mnemonic) || - (Mnemonic.startswith("vmov") && - !(ExtraToken == ".f16" || ExtraToken == ".32" || - ExtraToken == ".16" || ExtraToken == ".8")); + if (MS.isVPTPredicableCDEInstr(Mnemonic) || + (Mnemonic.startswith("vldrh") && Mnemonic != "vldrhi") || + (Mnemonic.startswith("vmov") && + !(ExtraToken == ".f16" || ExtraToken == ".32" || ExtraToken == ".16" || + ExtraToken == ".8")) || + (Mnemonic.startswith("vrint") && Mnemonic != "vrintr") || + (Mnemonic.startswith("vstrh") && Mnemonic != "vstrhi")) + return true; + + const char *predicable_prefixes[] = { + "vabav", "vabd", "vabs", "vadc", "vadd", + "vaddlv", "vaddv", "vand", "vbic", "vbrsr", + "vcadd", "vcls", "vclz", "vcmla", "vcmp", + "vcmul", "vctp", "vcvt", "vddup", "vdup", + "vdwdup", "veor", "vfma", "vfmas", "vfms", + "vhadd", "vhcadd", "vhsub", "vidup", "viwdup", + "vldrb", "vldrd", "vldrw", "vmax", "vmaxa", + "vmaxav", "vmaxnm", "vmaxnma", "vmaxnmav", "vmaxnmv", + "vmaxv", "vmin", "vminav", "vminnm", "vminnmav", + "vminnmv", "vminv", "vmla", "vmladav", "vmlaldav", + "vmlalv", "vmlas", "vmlav", "vmlsdav", "vmlsldav", + "vmovlb", "vmovlt", "vmovnb", "vmovnt", "vmul", + "vmvn", "vneg", "vorn", "vorr", "vpnot", + "vpsel", "vqabs", "vqadd", "vqdmladh", "vqdmlah", + "vqdmlash", "vqdmlsdh", "vqdmulh", "vqdmull", "vqmovn", + "vqmovun", "vqneg", "vqrdmladh", "vqrdmlah", "vqrdmlash", + "vqrdmlsdh", "vqrdmulh", "vqrshl", "vqrshrn", "vqrshrun", + "vqshl", "vqshrn", "vqshrun", "vqsub", "vrev16", + "vrev32", "vrev64", "vrhadd", "vrmlaldavh", "vrmlalvh", + "vrmlsldavh", "vrmulh", "vrshl", "vrshr", "vrshrn", + "vsbc", "vshl", "vshlc", "vshll", "vshr", + "vshrn", "vsli", "vsri", "vstrb", "vstrd", + "vstrw", "vsub"}; + + return std::any_of( + std::begin(predicable_prefixes), std::end(predicable_prefixes), + [&Mnemonic](const char *prefix) { return Mnemonic.startswith(prefix); }); } From 4f17d75b24ee3f75fb9755461c04a73ed2f018f8 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 23 Mar 2023 09:42:45 +0000 Subject: [PATCH 087/208] [X86] LowerVectorAllZero - early out if the type size is not pow2. NFC. 
--- llvm/lib/Target/X86/X86ISelLowering.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 6cf359d6d217a..e828fe4b9dd15 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -24167,6 +24167,10 @@ static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC, return SDValue(); } + // Quit if not convertable to legal scalar or 128/256-bit vector. + if (!llvm::has_single_bit(VT.getSizeInBits())) + return SDValue(); + assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode"); X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE); @@ -24188,10 +24192,6 @@ static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC, DAG.getConstant(0, DL, IntVT)); } - // Quit if not splittable to 128/256-bit vector. - if (!llvm::has_single_bit(VT.getSizeInBits())) - return SDValue(); - // Split down to 128/256-bit vector. unsigned TestSize = Subtarget.hasAVX() ? 256 : 128; while (VT.getSizeInBits() > TestSize) { From 7a5b95732ade6c2de69b26f1038aa0a5afc39393 Mon Sep 17 00:00:00 2001 From: mydeveloperday Date: Thu, 23 Mar 2023 09:45:00 +0000 Subject: [PATCH 088/208] [clang-format] NFC Format.h and ClangFormatStyleOptions.rst are out of date Regenerate the style documentation, requires some minor sphinx changes to avoid warnings Differential Revision: https://reviews.llvm.org/D146704 --- clang/docs/ClangFormatStyleOptions.rst | 89 ++++++++++++++++++++++++++ clang/include/clang/Format/Format.h | 48 +++++++++++--- 2 files changed, 127 insertions(+), 10 deletions(-) diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index fd8f2bbb54322..37500d7bff52d 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -3642,6 +3642,95 @@ the configuration (without a prefix: ``Auto``). **MacroBlockEnd** (``String``) :versionbadge:`clang-format 3.7` :ref:`¶ ` A regular expression matching macros that end a block. +.. _Macros: +<<<<<<< +======= + +**Macros** (``List of Strings``) :versionbadge:`clang-format 17.0` :ref:`¶ ` + A list of macros of the form ``=`` . + + Code will be parsed with macros expanded, in order to determine how to + interpret and format the macro arguments. + + For example, the code: + + .. code-block:: c++ + + A(a*b); + + will usually be interpreted as a call to a function A, and the + multiplication expression will be formatted as `a * b`. + + If we specify the macro definition: + + .. code-block:: yaml + + Macros: + - A(x)=x + + the code will now be parsed as a declaration of the variable b of type a*, + and formatted as `a* b` (depending on pointer-binding rules). + + Features and restrictions: + * Both function-like macros and object-like macros are supported. + * Macro arguments must be used exactly once in the expansion. + * No recursive expansion; macros referencing other macros will be + ignored. + * Overloading by arity is supported: for example, given the macro + definitions A=x, A()=y, A(a)=a: + + + .. code-block:: c++ + + A; -> x; + A(); -> y; + A(z); -> z; + A(a, b); // will not be expanded. + +.. _MaxEmptyLinesToKeep: +>>>>>>> + +**Macros** (``List of Strings``) :versionbadge:`clang-format 17.0` :ref:`¶ ` + A list of macros of the form ``=`` . + + Code will be parsed with macros expanded, in order to determine how to + interpret and format the macro arguments. + + For example, the code: + + .. 
code-block:: c++ + + A(a*b); + + will usually be interpreted as a call to a function A, and the + multiplication expression will be formatted as `a * b`. + + If we specify the macro definition: + + .. code-block:: yaml + + Macros: + - A(x)=x + + the code will now be parsed as a declaration of the variable b of type a*, + and formatted as `a* b` (depending on pointer-binding rules). + + Features and restrictions: + * Both function-like macros and object-like macros are supported. + * Macro arguments must be used exactly once in the expansion. + * No recursive expansion; macros referencing other macros will be + ignored. + * Overloading by arity is supported: for example, given the macro + definitions A=x, A()=y, A(a)=a: + + + .. code-block:: c++ + + A; -> x; + A(); -> y; + A(z); -> z; + A(a, b); // will not be expanded. + .. _MaxEmptyLinesToKeep: **MaxEmptyLinesToKeep** (``Unsigned``) :versionbadge:`clang-format 3.7` :ref:`¶ ` diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index 66904a6a11232..e2709cca3967f 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -2754,28 +2754,56 @@ struct FormatStyle { /// \code /// A(a*b); /// \endcode + /// /// will usually be interpreted as a call to a function A, and the /// multiplication expression will be formatted as `a * b`. /// /// If we specify the macro definition: - /// \code + /// \code{.yaml} /// Macros: /// - A(x)=x /// \endcode + /// /// the code will now be parsed as a declaration of the variable b of type a*, /// and formatted as `a* b` (depending on pointer-binding rules). /// /// Features and restrictions: - /// * Both function-like macros and object-like macros are supported. - /// * Macro arguments must be used exactly once in the expansion. - /// * No recursive expansion; macros referencing other macros will be +<<<<<<< +======= + /// * Both function-like macros and object-like macros are supported. + /// * Macro arguments must be used exactly once in the expansion. + /// * No recursive expansion; macros referencing other macros will be + /// ignored. + /// * Overloading by arity is supported: for example, given the macro + /// definitions A=x, A()=y, A(a)=a: + /// + /// \code + /// A; -> x; + /// A(); -> y; + /// A(z); -> z; + /// A(a, b); // will not be expanded. + /// \endcode + /// + /// \version 17.0 + std::vector Macros; + + /// The maximum number of consecutive empty lines to keep. +>>>>>>> + /// * Both function-like macros and object-like macros are supported. + /// * Macro arguments must be used exactly once in the expansion. + /// * No recursive expansion; macros referencing other macros will be /// ignored. - /// * Overloading by arity is supported: for example, given the macro - /// definitions A=x, A()=y, A(a)=a, - /// 'A;' -> 'x;' - /// 'A();' -> 'y;' - /// 'A(z);' -> 'z;' - /// 'A(a, b) will not be expanded. + /// * Overloading by arity is supported: for example, given the macro + /// definitions A=x, A()=y, A(a)=a: + /// + /// \code + /// A; -> x; + /// A(); -> y; + /// A(z); -> z; + /// A(a, b); // will not be expanded. + /// \endcode + /// + /// \version 17.0 std::vector Macros; /// The maximum number of consecutive empty lines to keep. From 7c928205c1f5a972f1f4dbeae83bd979c9a617d7 Mon Sep 17 00:00:00 2001 From: mydeveloperday Date: Thu, 23 Mar 2023 09:52:59 +0000 Subject: [PATCH 089/208] Revert "[clang-format] NFC Format.h and ClangFormatStyleOptions.rst are out of date" This reverts commit 7a5b95732ade6c2de69b26f1038aa0a5afc39393. 
--- clang/docs/ClangFormatStyleOptions.rst | 89 -------------------------- clang/include/clang/Format/Format.h | 48 +++----------- 2 files changed, 10 insertions(+), 127 deletions(-) diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index 37500d7bff52d..fd8f2bbb54322 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -3642,95 +3642,6 @@ the configuration (without a prefix: ``Auto``). **MacroBlockEnd** (``String``) :versionbadge:`clang-format 3.7` :ref:`¶ ` A regular expression matching macros that end a block. -.. _Macros: -<<<<<<< -======= - -**Macros** (``List of Strings``) :versionbadge:`clang-format 17.0` :ref:`¶ ` - A list of macros of the form ``=`` . - - Code will be parsed with macros expanded, in order to determine how to - interpret and format the macro arguments. - - For example, the code: - - .. code-block:: c++ - - A(a*b); - - will usually be interpreted as a call to a function A, and the - multiplication expression will be formatted as `a * b`. - - If we specify the macro definition: - - .. code-block:: yaml - - Macros: - - A(x)=x - - the code will now be parsed as a declaration of the variable b of type a*, - and formatted as `a* b` (depending on pointer-binding rules). - - Features and restrictions: - * Both function-like macros and object-like macros are supported. - * Macro arguments must be used exactly once in the expansion. - * No recursive expansion; macros referencing other macros will be - ignored. - * Overloading by arity is supported: for example, given the macro - definitions A=x, A()=y, A(a)=a: - - - .. code-block:: c++ - - A; -> x; - A(); -> y; - A(z); -> z; - A(a, b); // will not be expanded. - -.. _MaxEmptyLinesToKeep: ->>>>>>> - -**Macros** (``List of Strings``) :versionbadge:`clang-format 17.0` :ref:`¶ ` - A list of macros of the form ``=`` . - - Code will be parsed with macros expanded, in order to determine how to - interpret and format the macro arguments. - - For example, the code: - - .. code-block:: c++ - - A(a*b); - - will usually be interpreted as a call to a function A, and the - multiplication expression will be formatted as `a * b`. - - If we specify the macro definition: - - .. code-block:: yaml - - Macros: - - A(x)=x - - the code will now be parsed as a declaration of the variable b of type a*, - and formatted as `a* b` (depending on pointer-binding rules). - - Features and restrictions: - * Both function-like macros and object-like macros are supported. - * Macro arguments must be used exactly once in the expansion. - * No recursive expansion; macros referencing other macros will be - ignored. - * Overloading by arity is supported: for example, given the macro - definitions A=x, A()=y, A(a)=a: - - - .. code-block:: c++ - - A; -> x; - A(); -> y; - A(z); -> z; - A(a, b); // will not be expanded. - .. _MaxEmptyLinesToKeep: **MaxEmptyLinesToKeep** (``Unsigned``) :versionbadge:`clang-format 3.7` :ref:`¶ ` diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index e2709cca3967f..66904a6a11232 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -2754,56 +2754,28 @@ struct FormatStyle { /// \code /// A(a*b); /// \endcode - /// /// will usually be interpreted as a call to a function A, and the /// multiplication expression will be formatted as `a * b`. 
/// /// If we specify the macro definition: - /// \code{.yaml} + /// \code /// Macros: /// - A(x)=x /// \endcode - /// /// the code will now be parsed as a declaration of the variable b of type a*, /// and formatted as `a* b` (depending on pointer-binding rules). /// /// Features and restrictions: -<<<<<<< -======= - /// * Both function-like macros and object-like macros are supported. - /// * Macro arguments must be used exactly once in the expansion. - /// * No recursive expansion; macros referencing other macros will be - /// ignored. - /// * Overloading by arity is supported: for example, given the macro - /// definitions A=x, A()=y, A(a)=a: - /// - /// \code - /// A; -> x; - /// A(); -> y; - /// A(z); -> z; - /// A(a, b); // will not be expanded. - /// \endcode - /// - /// \version 17.0 - std::vector Macros; - - /// The maximum number of consecutive empty lines to keep. ->>>>>>> - /// * Both function-like macros and object-like macros are supported. - /// * Macro arguments must be used exactly once in the expansion. - /// * No recursive expansion; macros referencing other macros will be + /// * Both function-like macros and object-like macros are supported. + /// * Macro arguments must be used exactly once in the expansion. + /// * No recursive expansion; macros referencing other macros will be /// ignored. - /// * Overloading by arity is supported: for example, given the macro - /// definitions A=x, A()=y, A(a)=a: - /// - /// \code - /// A; -> x; - /// A(); -> y; - /// A(z); -> z; - /// A(a, b); // will not be expanded. - /// \endcode - /// - /// \version 17.0 + /// * Overloading by arity is supported: for example, given the macro + /// definitions A=x, A()=y, A(a)=a, + /// 'A;' -> 'x;' + /// 'A();' -> 'y;' + /// 'A(z);' -> 'z;' + /// 'A(a, b) will not be expanded. std::vector Macros; /// The maximum number of consecutive empty lines to keep. From 26d954bd4004dd01771308a5061a865073993130 Mon Sep 17 00:00:00 2001 From: Yevgeny Rouban Date: Thu, 23 Mar 2023 16:19:19 +0700 Subject: [PATCH 090/208] [AsmParser] Avoid instantiating LLVMContext if not needed. Try 2. The deleted copy constructor LLVMContext(LLVMContext &) got its parameter changed to const to allow the latest clang compiler to instantiatiate template std::optional. Differential Revision: https://reviews.llvm.org/D142699 --- llvm/include/llvm/IR/LLVMContext.h | 2 +- llvm/lib/AsmParser/Parser.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/IR/LLVMContext.h b/llvm/include/llvm/IR/LLVMContext.h index a8095a5c2fd30..c13a783e86c7a 100644 --- a/llvm/include/llvm/IR/LLVMContext.h +++ b/llvm/include/llvm/IR/LLVMContext.h @@ -68,7 +68,7 @@ class LLVMContext { public: LLVMContextImpl *const pImpl; LLVMContext(); - LLVMContext(LLVMContext &) = delete; + LLVMContext(const LLVMContext &) = delete; LLVMContext &operator=(const LLVMContext &) = delete; ~LLVMContext(); diff --git a/llvm/lib/AsmParser/Parser.cpp b/llvm/lib/AsmParser/Parser.cpp index 035eea81378e5..eded892f358a8 100644 --- a/llvm/lib/AsmParser/Parser.cpp +++ b/llvm/lib/AsmParser/Parser.cpp @@ -28,9 +28,9 @@ static bool parseAssemblyInto(MemoryBufferRef F, Module *M, std::unique_ptr Buf = MemoryBuffer::getMemBuffer(F); SM.AddNewSourceBuffer(std::move(Buf), SMLoc()); - LLVMContext Context; + std::optional OptContext; return LLParser(F.getBuffer(), SM, Err, M, Index, - M ? M->getContext() : Context, Slots) + M ? 
M->getContext() : OptContext.emplace(), Slots) .Run(UpgradeDebugInfo, DataLayoutCallback); } From d25751779baa37356265b004edc7e55ee4a4c383 Mon Sep 17 00:00:00 2001 From: Job Noorman Date: Thu, 23 Mar 2023 17:45:16 +0800 Subject: [PATCH 091/208] Bump RV32E version to 2.0 RV32E was recently [ratified](https://github.com/riscv/riscv-isa-manual/commit/afd613691cb89ccd7584206e8a6d1866fe77ec88) so we should update the version as our MC-layer support is complete. Reviewed By: kito-cheng Differential Revision: https://reviews.llvm.org/D144384 --- llvm/lib/Support/RISCVISAInfo.cpp | 2 +- llvm/test/MC/RISCV/attribute-arch.s | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Support/RISCVISAInfo.cpp b/llvm/lib/Support/RISCVISAInfo.cpp index 603b1f3d64737..93cf66ff1f739 100644 --- a/llvm/lib/Support/RISCVISAInfo.cpp +++ b/llvm/lib/Support/RISCVISAInfo.cpp @@ -41,7 +41,7 @@ static constexpr StringLiteral AllStdExts = "mafdqlcbkjtpvnh"; static const RISCVSupportedExtension SupportedExtensions[] = { {"i", RISCVExtensionVersion{2, 0}}, - {"e", RISCVExtensionVersion{1, 9}}, + {"e", RISCVExtensionVersion{2, 0}}, {"m", RISCVExtensionVersion{2, 0}}, {"a", RISCVExtensionVersion{2, 0}}, {"f", RISCVExtensionVersion{2, 0}}, diff --git a/llvm/test/MC/RISCV/attribute-arch.s b/llvm/test/MC/RISCV/attribute-arch.s index 7856dcf94cd1c..af0b3fe0cdc29 100644 --- a/llvm/test/MC/RISCV/attribute-arch.s +++ b/llvm/test/MC/RISCV/attribute-arch.s @@ -12,6 +12,9 @@ .attribute arch, "rv32i2p0" # CHECK: attribute 5, "rv32i2p0" +.attribute arch, "rv32e" +# CHECK: attribute 5, "rv32e2p0" + .attribute arch, "rv32i2_m2" # CHECK: attribute 5, "rv32i2p0_m2p0" From c2de8ff92753acdb1ace7a27cc11cb09f28eb8fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= Date: Thu, 23 Mar 2023 11:10:39 +0100 Subject: [PATCH 092/208] [JITLink] Initial AArch32 backend This first version lays the foundations for AArch32 support in JITLink. ELFLinkGraphBuilder_aarch32 processes REL-type relocations and populates LinkGraphs from ELF object files for both big- and little-endian systems. The ArmCfg member controls subarchitecture-specific details throughout the linking process (i.e. it's passed to ELFJITLinker_aarch32). Relocation types follow the ABI documentation's division into classes: Data (endian-sensitive), Arm (32-bit little-endian) and Thumb (2x 16-bit little-endian, "Thumb32" in the docs). The implementation of instruction encoding/decoding for relocation resolution is implemented symmetrically and is testable in isolation (see AArch32 category in JITLinkTests). Callable Thumb functions are marked with a ThumbSymbol target-flag and stored in the LinkGraph with their real addresses. The thumb-bit is added back in when the owning JITDylib requests the address for such a symbol. The StubsManager can generate (absolute) Thumb-state stubs for branch range extensions on v7+ targets. Proper GOT/PLT handling is not yet implemented. This patch is based on the backend implementation in ez-clang and has just enough functionality to model the infrastructure and link a Thumb function `main()` that calls `printf()` to dump "Hello Arm!" on Armv7a. It was tested on Raspberry Pi with 32-bit Raspbian OS. 
Reviewed By: lhames Differential Revision: https://reviews.llvm.org/D144083 --- .../ExecutionEngine/JITLink/ELF_aarch32.h | 38 ++ .../llvm/ExecutionEngine/JITLink/aarch32.h | 293 ++++++++++ .../ExecutionEngine/JITLink/CMakeLists.txt | 2 + llvm/lib/ExecutionEngine/JITLink/ELF.cpp | 9 + .../JITLink/ELFLinkGraphBuilder.h | 21 + .../ExecutionEngine/JITLink/ELF_aarch32.cpp | 299 ++++++++++ llvm/lib/ExecutionEngine/JITLink/aarch32.cpp | 514 ++++++++++++++++++ .../Orc/ObjectLinkingLayer.cpp | 6 +- .../JITLink/AArch32/ELF_thumbv7_printf.s | 46 ++ .../JITLink/AArch32/lit.local.cfg | 2 + .../ExecutionEngine/JITLink/AArch32Tests.cpp | 200 +++++++ .../ExecutionEngine/JITLink/CMakeLists.txt | 1 + 12 files changed, 1430 insertions(+), 1 deletion(-) create mode 100644 llvm/include/llvm/ExecutionEngine/JITLink/ELF_aarch32.h create mode 100644 llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h create mode 100644 llvm/lib/ExecutionEngine/JITLink/ELF_aarch32.cpp create mode 100644 llvm/lib/ExecutionEngine/JITLink/aarch32.cpp create mode 100644 llvm/test/ExecutionEngine/JITLink/AArch32/ELF_thumbv7_printf.s create mode 100644 llvm/test/ExecutionEngine/JITLink/AArch32/lit.local.cfg create mode 100644 llvm/unittests/ExecutionEngine/JITLink/AArch32Tests.cpp diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/ELF_aarch32.h b/llvm/include/llvm/ExecutionEngine/JITLink/ELF_aarch32.h new file mode 100644 index 0000000000000..25d1c3aac2c26 --- /dev/null +++ b/llvm/include/llvm/ExecutionEngine/JITLink/ELF_aarch32.h @@ -0,0 +1,38 @@ +//===---- ELF_aarch32.h - JIT link functions for arm/thumb -----*- C++ -*--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// jit-link functions for ELF/aarch32. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_JITLINK_ELF_AARCH32 +#define LLVM_EXECUTIONENGINE_JITLINK_ELF_AARCH32 + +#include "llvm/ExecutionEngine/JITLink/JITLink.h" +#include "llvm/ExecutionEngine/JITLink/aarch32.h" + +namespace llvm { +namespace jitlink { + +/// Create a LinkGraph from an ELF/arm relocatable object +/// +/// Note: The graph does not take ownership of the underlying buffer, nor copy +/// its contents. The caller is responsible for ensuring that the object buffer +/// outlives the graph. +Expected> +createLinkGraphFromELFObject_aarch32(MemoryBufferRef ObjectBuffer); + +/// jit-link the given object buffer, which must be an ELF arm/thumb object +/// file. +void link_ELF_aarch32(std::unique_ptr G, + std::unique_ptr Ctx); + +} // end namespace jitlink +} // end namespace llvm + +#endif // LLVM_EXECUTIONENGINE_JITLINK_ELF_AARCH32 diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h b/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h new file mode 100644 index 0000000000000..8488b10278771 --- /dev/null +++ b/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h @@ -0,0 +1,293 @@ +//===------ aarch32.h - Generic JITLink arm/thumb utilities -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Generic utilities for graphs representing arm/thumb objects. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_JITLINK_AARCH32 +#define LLVM_EXECUTIONENGINE_JITLINK_AARCH32 + +#include "TableManager.h" +#include "llvm/ExecutionEngine/JITLink/JITLink.h" +#include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h" +#include "llvm/Support/ARMBuildAttributes.h" +#include "llvm/Support/Error.h" + +namespace llvm { +namespace jitlink { +namespace aarch32 { + +/// JITLink-internal AArch32 fixup kinds +enum EdgeKind_aarch32 : Edge::Kind { + + /// + /// Relocations of class Data + /// + FirstDataRelocation = Edge::FirstRelocation, + + /// Plain 32-bit value relocation in target endianness + Data_Delta32 = FirstDataRelocation, + + LastDataRelocation = Data_Delta32, + + /// + /// Relocations of class Arm (covers fixed-width 4-byte instruction subset) + /// + FirstArmRelocation, + + /// TODO: Arm_Call is here only as a placeholder for now. + Arm_Call = FirstArmRelocation, + + LastArmRelocation = Arm_Call, + + /// + /// Relocations of class Thumb16 and Thumb32 (covers Thumb instruction subset) + /// + FirstThumbRelocation, + + /// Write immediate value for PC-relative branch with link (can bridge between + /// Arm and Thumb). + Thumb_Call = FirstThumbRelocation, + + /// Write immediate value for (unconditional) PC-relative branch without link. + Thumb_Jump24, + + /// Write immediate value to the lower halfword of the destination register + Thumb_MovwAbsNC, + + /// Write immediate value to the top halfword of the destination register + Thumb_MovtAbs, + + LastThumbRelocation = Thumb_MovtAbs, +}; + +/// Flags enum for AArch32-specific symbol properties +enum TargetFlags_aarch32 : TargetFlagsType { + ThumbSymbol = 1 << 0, +}; + +/// Human-readable name for a given CPU architecture kind +const char *getCPUArchName(ARMBuildAttrs::CPUArch K); + +/// Get a human-readable name for the given AArch32 edge kind. +const char *getEdgeKindName(Edge::Kind K); + +/// AArch32 uses stubs for a number of purposes, like branch range extension +/// or interworking between Arm and Thumb instruction subsets. +/// +/// Stub implementations vary depending on CPU architecture (v4, v6, v7), +/// instruction subset and branch type (absolute/PC-relative). +/// +/// For each kind of stub, the StubsFlavor defines one concrete form that is +/// used throughout the LinkGraph. +/// +/// Stubs are often called "veneers" in the official docs and online. +/// +enum StubsFlavor { + Unsupported = 0, + Thumbv7, +}; + +/// JITLink sub-arch configuration for Arm CPU models +struct ArmConfig { + bool J1J2BranchEncoding = false; + StubsFlavor Stubs = Unsupported; +}; + +/// Obtain the sub-arch configuration for a given Arm CPU model. 
+inline ArmConfig getArmConfigForCPUArch(ARMBuildAttrs::CPUArch CPUArch) { + ArmConfig ArmCfg; + switch (CPUArch) { + case ARMBuildAttrs::v7: + case ARMBuildAttrs::v8_A: + ArmCfg.J1J2BranchEncoding = true; + ArmCfg.Stubs = Thumbv7; + break; + default: + DEBUG_WITH_TYPE("jitlink", { + dbgs() << " Warning: ARM config not defined for CPU architecture " + << getCPUArchName(CPUArch); + }); + break; + } + return ArmCfg; +} + +/// Immutable pair of halfwords, Hi and Lo, with overflow check +struct HalfWords { + constexpr HalfWords() : Hi(0), Lo(0) {} + constexpr HalfWords(uint32_t Hi, uint32_t Lo) : Hi(Hi), Lo(Lo) { + assert(isUInt<16>(Hi) && "Overflow in first half-word"); + assert(isUInt<16>(Lo) && "Overflow in second half-word"); + } + const uint16_t Hi; // First halfword + const uint16_t Lo; // Second halfword +}; + +/// Collection of named constants per fixup kind. It may contain but is not +/// limited to the following entries: +/// +/// Opcode - Values of the op-code bits in the instruction, with +/// unaffected bits nulled +/// OpcodeMask - Mask with all bits set that encode the op-code +/// ImmMask - Mask with all bits set that encode the immediate value +/// RegMask - Mask with all bits set that encode the register +/// +template struct FixupInfo {}; + +template <> struct FixupInfo { + static constexpr HalfWords Opcode{0xf000, 0x8000}; + static constexpr HalfWords OpcodeMask{0xf800, 0x8000}; + static constexpr HalfWords ImmMask{0x07ff, 0x2fff}; + static constexpr uint16_t LoBitConditional = 0x1000; +}; + +template <> struct FixupInfo { + static constexpr HalfWords Opcode{0xf000, 0xc000}; + static constexpr HalfWords OpcodeMask{0xf800, 0xc000}; + static constexpr HalfWords ImmMask{0x07ff, 0x2fff}; + static constexpr uint16_t LoBitH = 0x0001; + static constexpr uint16_t LoBitNoBlx = 0x1000; +}; + +template <> struct FixupInfo { + static constexpr HalfWords Opcode{0xf2c0, 0x0000}; + static constexpr HalfWords OpcodeMask{0xfbf0, 0x8000}; + static constexpr HalfWords ImmMask{0x040f, 0x70ff}; + static constexpr HalfWords RegMask{0x0000, 0x0f00}; +}; + +template <> +struct FixupInfo : public FixupInfo { + static constexpr HalfWords Opcode{0xf240, 0x0000}; +}; + +/// Helper function to read the initial addend for Data-class relocations. +Expected readAddendData(LinkGraph &G, Block &B, const Edge &E); + +/// Helper function to read the initial addend for Arm-class relocations. +Expected readAddendArm(LinkGraph &G, Block &B, const Edge &E); + +/// Helper function to read the initial addend for Thumb-class relocations. +Expected readAddendThumb(LinkGraph &G, Block &B, const Edge &E, + const ArmConfig &ArmCfg); + +/// Read the initial addend for a REL-type relocation. It's the value encoded +/// in the immediate field of the fixup location by the compiler. +inline Expected readAddend(LinkGraph &G, Block &B, const Edge &E, + const ArmConfig &ArmCfg) { + Edge::Kind Kind = E.getKind(); + if (Kind <= LastDataRelocation) + return readAddendData(G, B, E); + + if (Kind <= LastArmRelocation) + return readAddendArm(G, B, E); + + if (Kind <= LastThumbRelocation) + return readAddendThumb(G, B, E, ArmCfg); + + llvm_unreachable("Relocation must be of class Data, Arm or Thumb"); +} + +/// Helper function to apply the fixup for Data-class relocations. +Error applyFixupData(LinkGraph &G, Block &B, const Edge &E); + +/// Helper function to apply the fixup for Arm-class relocations. +Error applyFixupArm(LinkGraph &G, Block &B, const Edge &E); + +/// Helper function to apply the fixup for Thumb-class relocations. 
+Error applyFixupThumb(LinkGraph &G, Block &B, const Edge &E, + const ArmConfig &ArmCfg); + +/// Apply fixup expression for edge to block content. +inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E, + const ArmConfig &ArmCfg) { + Edge::Kind Kind = E.getKind(); + + if (Kind <= LastDataRelocation) + return applyFixupData(G, B, E); + + if (Kind <= LastArmRelocation) + return applyFixupArm(G, B, E); + + if (Kind <= LastThumbRelocation) + return applyFixupThumb(G, B, E, ArmCfg); + + llvm_unreachable("Relocation must be of class Data, Arm or Thumb"); +} + +/// Stubs builder for a specific StubsFlavor +/// +/// Right now we only have one default stub kind, but we want to extend this +/// and allow creation of specific kinds in the future (e.g. branch range +/// extension or interworking). +/// +/// Let's keep it simple for the moment and not wire this through a GOT. +/// +template +class StubsManager : public TableManager> { +public: + StubsManager() = default; + + /// Name of the object file section that will contain all our stubs. + static StringRef getSectionName() { return "__llvm_jitlink_STUBS"; } + + /// Implements link-graph traversal via visitExistingEdges(). + bool visitEdge(LinkGraph &G, Block *B, Edge &E) { + if (E.getTarget().isDefined()) + return false; + + switch (E.getKind()) { + case Thumb_Call: + case Thumb_Jump24: { + DEBUG_WITH_TYPE("jitlink", { + dbgs() << " Fixing " << G.getEdgeKindName(E.getKind()) << " edge at " + << B->getFixupAddress(E) << " (" << B->getAddress() << " + " + << formatv("{0:x}", E.getOffset()) << ")\n"; + }); + E.setTarget(this->getEntryForTarget(G, E.getTarget())); + return true; + } + } + return false; + } + + /// Create a branch range extension stub for the class's flavor. + Symbol &createEntry(LinkGraph &G, Symbol &Target); + +private: + /// Create a new node in the link-graph for the given stub template. + template + Block &addStub(LinkGraph &G, const uint8_t (&Code)[Size], + uint64_t Alignment) { + ArrayRef Template(reinterpret_cast(Code), Size); + return G.createContentBlock(getStubsSection(G), Template, + orc::ExecutorAddr(), Alignment, 0); + } + + /// Get or create the object file section that will contain all our stubs. + Section &getStubsSection(LinkGraph &G) { + if (!StubsSection) + StubsSection = &G.createSection(getSectionName(), + orc::MemProt::Read | orc::MemProt::Exec); + return *StubsSection; + } + + Section *StubsSection = nullptr; +}; + +/// Create a branch range extension stub with Thumb encoding for v7 CPUs. 
+template <> +Symbol &StubsManager::createEntry(LinkGraph &G, Symbol &Target); + +} // namespace aarch32 +} // namespace jitlink +} // namespace llvm + +#endif // LLVM_EXECUTIONENGINE_JITLINK_AARCH32 diff --git a/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt b/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt index 52ff5e8370031..bc86f45d3c185 100644 --- a/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt +++ b/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt @@ -20,6 +20,7 @@ add_llvm_component_library(LLVMJITLink # ELF ELF.cpp ELFLinkGraphBuilder.cpp + ELF_aarch32.cpp ELF_aarch64.cpp ELF_i386.cpp ELF_loongarch.cpp @@ -33,6 +34,7 @@ add_llvm_component_library(LLVMJITLink COFF_x86_64.cpp # Architectures: + aarch32.cpp aarch64.cpp i386.cpp loongarch.cpp diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF.cpp index ef0f19a785712..340a0ce134475 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF.cpp @@ -13,6 +13,7 @@ #include "llvm/ExecutionEngine/JITLink/ELF.h" #include "llvm/BinaryFormat/ELF.h" +#include "llvm/ExecutionEngine/JITLink/ELF_aarch32.h" #include "llvm/ExecutionEngine/JITLink/ELF_aarch64.h" #include "llvm/ExecutionEngine/JITLink/ELF_i386.h" #include "llvm/ExecutionEngine/JITLink/ELF_loongarch.h" @@ -69,6 +70,8 @@ createLinkGraphFromELFObject(MemoryBufferRef ObjectBuffer) { switch (*TargetMachineArch) { case ELF::EM_AARCH64: return createLinkGraphFromELFObject_aarch64(ObjectBuffer); + case ELF::EM_ARM: + return createLinkGraphFromELFObject_aarch32(ObjectBuffer); case ELF::EM_LOONGARCH: return createLinkGraphFromELFObject_loongarch(ObjectBuffer); case ELF::EM_RISCV: @@ -90,6 +93,12 @@ void link_ELF(std::unique_ptr G, case Triple::aarch64: link_ELF_aarch64(std::move(G), std::move(Ctx)); return; + case Triple::arm: + case Triple::armeb: + case Triple::thumb: + case Triple::thumbeb: + link_ELF_aarch32(std::move(G), std::move(Ctx)); + return; case Triple::loongarch32: case Triple::loongarch64: link_ELF_loongarch(std::move(G), std::move(Ctx)); diff --git a/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h b/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h index 9d2d4958dcf6c..1d98acf868695 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h +++ b/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h @@ -127,6 +127,12 @@ class ELFLinkGraphBuilder : public ELFLinkGraphBuilderBase { Error graphifySections(); Error graphifySymbols(); + /// Override in derived classes to suppress certain sections in the link + /// graph. + virtual bool excludeSection(const typename ELFT::Shdr &Sect) const { + return false; + } + /// Traverse all matching ELFT::Rela relocation records in the given section. /// The handler function Func should be callable with this signature: /// Error(const typename ELFT::Rela &, @@ -321,6 +327,13 @@ template Error ELFLinkGraphBuilder::graphifySections() { auto Name = Obj.getSectionName(Sec, SectionStringTab); if (!Name) return Name.takeError(); + if (excludeSection(Sec)) { + LLVM_DEBUG({ + dbgs() << " " << SecIndex << ": Skipping section \"" << *Name + << "\" explicitly\n"; + }); + continue; + } // Skip null sections. 
if (Sec.sh_type == ELF::SHT_NULL) { @@ -564,6 +577,10 @@ Error ELFLinkGraphBuilder::forEachRelaRelocation( LLVM_DEBUG(dbgs() << " skipped (dwarf section)\n\n"); return Error::success(); } + if (excludeSection(**FixupSection)) { + LLVM_DEBUG(dbgs() << " skipped (fixup section excluded explicitly)\n\n"); + return Error::success(); + } // Lookup the link-graph node corresponding to the target section name. auto *BlockToFix = getGraphBlock(RelSect.sh_info); @@ -610,6 +627,10 @@ Error ELFLinkGraphBuilder::forEachRelRelocation( LLVM_DEBUG(dbgs() << " skipped (dwarf section)\n\n"); return Error::success(); } + if (excludeSection(**FixupSection)) { + LLVM_DEBUG(dbgs() << " skipped (fixup section excluded explicitly)\n\n"); + return Error::success(); + } // Lookup the link-graph node corresponding to the target section name. auto *BlockToFix = getGraphBlock(RelSect.sh_info); diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_aarch32.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_aarch32.cpp new file mode 100644 index 0000000000000..0010088fef1e7 --- /dev/null +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_aarch32.cpp @@ -0,0 +1,299 @@ +//===----- ELF_aarch32.cpp - JIT linker implementation for arm/thumb ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// ELF/aarch32 jit-link implementation. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/JITLink/ELF_aarch32.h" + +#include "llvm/BinaryFormat/ELF.h" +#include "llvm/ExecutionEngine/JITLink/JITLink.h" +#include "llvm/ExecutionEngine/JITLink/aarch32.h" +#include "llvm/Object/ELF.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/TargetParser/ARMTargetParser.h" + +#include "ELFLinkGraphBuilder.h" +#include "JITLinkGeneric.h" + +#define DEBUG_TYPE "jitlink" + +using namespace llvm::object; + +namespace llvm { +namespace jitlink { + +/// Translate from ELF relocation type to JITLink-internal edge kind. +Expected getJITLinkEdgeKind(uint32_t ELFType) { + switch (ELFType) { + case ELF::R_ARM_REL32: + return aarch32::Data_Delta32; + case ELF::R_ARM_CALL: + return aarch32::Arm_Call; + case ELF::R_ARM_THM_CALL: + return aarch32::Thumb_Call; + case ELF::R_ARM_THM_JUMP24: + return aarch32::Thumb_Jump24; + case ELF::R_ARM_THM_MOVW_ABS_NC: + return aarch32::Thumb_MovwAbsNC; + case ELF::R_ARM_THM_MOVT_ABS: + return aarch32::Thumb_MovtAbs; + } + + return make_error( + "Unsupported aarch32 relocation " + formatv("{0:d}: ", ELFType) + + object::getELFRelocationTypeName(ELF::EM_ARM, ELFType)); +} + +/// Translate from JITLink-internal edge kind back to ELF relocation type. +Expected getELFRelocationType(Edge::Kind Kind) { + switch (static_cast(Kind)) { + case aarch32::Data_Delta32: + return ELF::R_ARM_REL32; + case aarch32::Arm_Call: + return ELF::R_ARM_CALL; + case aarch32::Thumb_Call: + return ELF::R_ARM_THM_CALL; + case aarch32::Thumb_Jump24: + return ELF::R_ARM_THM_JUMP24; + case aarch32::Thumb_MovwAbsNC: + return ELF::R_ARM_THM_MOVW_ABS_NC; + case aarch32::Thumb_MovtAbs: + return ELF::R_ARM_THM_MOVT_ABS; + } + + return make_error(formatv("Invalid aarch32 edge {0:d}: ", + Kind)); +} + +/// Get a human-readable name for the given ELF AArch32 edge kind. 
+const char *getELFAArch32EdgeKindName(Edge::Kind R) { + // No ELF-specific edge kinds yet + return aarch32::getEdgeKindName(R); +} + +class ELFJITLinker_aarch32 : public JITLinker { + friend class JITLinker; + +public: + ELFJITLinker_aarch32(std::unique_ptr Ctx, + std::unique_ptr G, PassConfiguration PassCfg, + aarch32::ArmConfig ArmCfg) + : JITLinker(std::move(Ctx), std::move(G), std::move(PassCfg)), + ArmCfg(std::move(ArmCfg)) {} + +private: + aarch32::ArmConfig ArmCfg; + + Error applyFixup(LinkGraph &G, Block &B, const Edge &E) const { + return aarch32::applyFixup(G, B, E, ArmCfg); + } +}; + +template +class ELFLinkGraphBuilder_aarch32 + : public ELFLinkGraphBuilder> { +private: + using ELFT = ELFType; + using Base = ELFLinkGraphBuilder; + + bool excludeSection(const typename ELFT::Shdr &Sect) const override { + // TODO: An .ARM.exidx (Exception Index table) entry is 8-bytes in size and + // consists of 2 words. It might be sufficient to process only relocations + // in the the second word (offset 4). Please find more details in: Exception + // Handling ABI for the Arm® Architecture -> Index table entries + if (Sect.sh_type == ELF::SHT_ARM_EXIDX) + return true; + return false; + } + + Error addRelocations() override { + LLVM_DEBUG(dbgs() << "Processing relocations:\n"); + using Self = ELFLinkGraphBuilder_aarch32; + for (const auto &RelSect : Base::Sections) { + if (Error Err = Base::forEachRelRelocation(RelSect, this, + &Self::addSingleRelRelocation)) + return Err; + } + return Error::success(); + } + + Error addSingleRelRelocation(const typename ELFT::Rel &Rel, + const typename ELFT::Shdr &FixupSect, + Block &BlockToFix) { + uint32_t SymbolIndex = Rel.getSymbol(false); + auto ObjSymbol = Base::Obj.getRelocationSymbol(Rel, Base::SymTabSec); + if (!ObjSymbol) + return ObjSymbol.takeError(); + + Symbol *GraphSymbol = Base::getGraphSymbol(SymbolIndex); + if (!GraphSymbol) + return make_error( + formatv("Could not find symbol at given index, did you add it to " + "JITSymbolTable? 
index: {0}, shndx: {1} Size of table: {2}", + SymbolIndex, (*ObjSymbol)->st_shndx, + Base::GraphSymbols.size()), + inconvertibleErrorCode()); + + uint32_t Type = Rel.getType(false); + Expected Kind = getJITLinkEdgeKind(Type); + if (!Kind) + return Kind.takeError(); + + auto FixupAddress = orc::ExecutorAddr(FixupSect.sh_addr) + Rel.r_offset; + Edge::OffsetT Offset = FixupAddress - BlockToFix.getAddress(); + Edge E(*Kind, Offset, *GraphSymbol, 0); + + Expected Addend = + aarch32::readAddend(*Base::G, BlockToFix, E, ArmCfg); + if (!Addend) + return Addend.takeError(); + + E.setAddend(*Addend); + LLVM_DEBUG({ + dbgs() << " "; + printEdge(dbgs(), BlockToFix, E, getELFAArch32EdgeKindName(*Kind)); + dbgs() << "\n"; + }); + + BlockToFix.addEdge(std::move(E)); + return Error::success(); + } + + aarch32::ArmConfig ArmCfg; + +protected: + TargetFlagsType makeTargetFlags(const typename ELFT::Sym &Sym) override { + if (Sym.getValue() & 0x01) + return aarch32::ThumbSymbol; + return TargetFlagsType{}; + } + + orc::ExecutorAddrDiff getRawOffset(const typename ELFT::Sym &Sym, + TargetFlagsType Flags) override { + assert((makeTargetFlags(Sym) & Flags) == Flags); + static constexpr uint64_t ThumbBit = 0x01; + return Sym.getValue() & ~ThumbBit; + } + +public: + ELFLinkGraphBuilder_aarch32(StringRef FileName, const ELFFile &Obj, + Triple TT, aarch32::ArmConfig ArmCfg) + : ELFLinkGraphBuilder(Obj, std::move(TT), FileName, + getELFAArch32EdgeKindName), + ArmCfg(std::move(ArmCfg)) {} +}; + +template +Error buildTables_ELF_aarch32(LinkGraph &G) { + LLVM_DEBUG(dbgs() << "Visiting edges in graph:\n"); + + aarch32::StubsManager PLT; + visitExistingEdges(G, PLT); + return Error::success(); +} + +Expected> +createLinkGraphFromELFObject_aarch32(MemoryBufferRef ObjectBuffer) { + LLVM_DEBUG({ + dbgs() << "Building jitlink graph for new input " + << ObjectBuffer.getBufferIdentifier() << "...\n"; + }); + + auto ELFObj = ObjectFile::createELFObjectFile(ObjectBuffer); + if (!ELFObj) + return ELFObj.takeError(); + + // Find out what exact AArch32 instruction set and features we target. + auto TT = (*ELFObj)->makeTriple(); + ARM::ArchKind AK = ARM::parseArch(TT.getArchName()); + if (AK == ARM::ArchKind::INVALID) + return make_error( + "Failed to build ELF link graph: Invalid ARM ArchKind"); + + // Resolve our internal configuration for the target. If at some point the + // CPUArch alone becomes too unprecise, we can find more details in the + // Tag_CPU_arch_profile. + aarch32::ArmConfig ArmCfg; + using namespace ARMBuildAttrs; + auto Arch = static_cast(ARM::getArchAttr(AK)); + switch (Arch) { + case v7: + case v8_A: + ArmCfg = aarch32::getArmConfigForCPUArch(Arch); + assert(ArmCfg.Stubs != aarch32::Unsupported && + "Provide a config for each supported CPU"); + break; + default: + return make_error( + "Failed to build ELF link graph: Unsupported CPU arch " + + StringRef(aarch32::getCPUArchName(Arch))); + } + + // Populate the link-graph. 
+ switch (TT.getArch()) { + case Triple::arm: + case Triple::thumb: { + auto &ELFFile = cast>(**ELFObj).getELFFile(); + return ELFLinkGraphBuilder_aarch32( + (*ELFObj)->getFileName(), ELFFile, TT, ArmCfg) + .buildGraph(); + } + case Triple::armeb: + case Triple::thumbeb: { + auto &ELFFile = cast>(**ELFObj).getELFFile(); + return ELFLinkGraphBuilder_aarch32((*ELFObj)->getFileName(), + ELFFile, TT, ArmCfg) + .buildGraph(); + } + default: + return make_error( + "Failed to build ELF/aarch32 link graph: Invalid target triple " + + TT.getTriple()); + } +} + +void link_ELF_aarch32(std::unique_ptr G, + std::unique_ptr Ctx) { + const Triple &TT = G->getTargetTriple(); + + using namespace ARMBuildAttrs; + ARM::ArchKind AK = ARM::parseArch(TT.getArchName()); + auto CPU = static_cast(ARM::getArchAttr(AK)); + aarch32::ArmConfig ArmCfg = aarch32::getArmConfigForCPUArch(CPU); + + PassConfiguration PassCfg; + if (Ctx->shouldAddDefaultTargetPasses(TT)) { + // Add a mark-live pass. + if (auto MarkLive = Ctx->getMarkLivePass(TT)) + PassCfg.PrePrunePasses.push_back(std::move(MarkLive)); + else + PassCfg.PrePrunePasses.push_back(markAllSymbolsLive); + + switch (ArmCfg.Stubs) { + case aarch32::Thumbv7: + PassCfg.PostPrunePasses.push_back( + buildTables_ELF_aarch32); + break; + case aarch32::Unsupported: + llvm_unreachable("Check before building graph"); + } + } + + if (auto Err = Ctx->modifyPassConfig(*G, PassCfg)) + return Ctx->notifyFailed(std::move(Err)); + + ELFJITLinker_aarch32::link(std::move(Ctx), std::move(G), std::move(PassCfg), + std::move(ArmCfg)); +} + +} // namespace jitlink +} // namespace llvm diff --git a/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp b/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp new file mode 100644 index 0000000000000..6f49a4578cf7c --- /dev/null +++ b/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp @@ -0,0 +1,514 @@ +//===--------- aarch32.cpp - Generic JITLink arm/thumb utilities ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Generic utilities for graphs representing arm/thumb objects. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/JITLink/aarch32.h" + +#include "llvm/ADT/StringExtras.h" +#include "llvm/BinaryFormat/ELF.h" +#include "llvm/ExecutionEngine/JITLink/JITLink.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/MathExtras.h" + +#define DEBUG_TYPE "jitlink" + +namespace llvm { +namespace jitlink { +namespace aarch32 { + +using namespace support; +using namespace support::endian; + +/// Encode 22-bit immediate value for branch instructions without J1J2 range +/// extension (formats B T4, BL T1 and BLX T2). +/// +/// 00000:Imm11H:Imm11L:0 -> [ 00000:Imm11H, 00000:Imm11L ] +/// J1^ ^J2 will always be 1 +/// +HalfWords encodeImmBT4BlT1BlxT2(int64_t Value) { + constexpr uint32_t J1J2 = 0x2800; + uint32_t Imm11H = (Value >> 12) & 0x07ff; + uint32_t Imm11L = (Value >> 1) & 0x07ff; + return HalfWords{Imm11H, Imm11L | J1J2}; +} + +/// Decode 22-bit immediate value for branch instructions without J1J2 range +/// extension (formats B T4, BL T1 and BLX T2). 
+/// +/// [ 00000:Imm11H, 00000:Imm11L ] -> 00000:Imm11H:Imm11L:0 +/// J1^ ^J2 will always be 1 +/// +int64_t decodeImmBT4BlT1BlxT2(uint32_t Hi, uint32_t Lo) { + uint32_t Imm11H = Hi & 0x07ff; + uint32_t Imm11L = Lo & 0x07ff; + return SignExtend64<22>(Imm11H << 12 | Imm11L << 1); +} + +/// Encode 25-bit immediate value for branch instructions with J1J2 range +/// extension (formats B T4, BL T1 and BLX T2). +/// +/// S:I1:I2:Imm10:Imm11:0 -> [ 00000:S:Imm10, 00:J1:0:J2:Imm11 ] +/// +HalfWords encodeImmBT4BlT1BlxT2_J1J2(int64_t Value) { + uint32_t S = (Value >> 14) & 0x0400; + uint32_t J1 = (((~(Value >> 10)) ^ (Value >> 11)) & 0x2000); + uint32_t J2 = (((~(Value >> 11)) ^ (Value >> 13)) & 0x0800); + uint32_t Imm10 = (Value >> 12) & 0x03ff; + uint32_t Imm11 = (Value >> 1) & 0x07ff; + return HalfWords{S | Imm10, J1 | J2 | Imm11}; +} + +/// Decode 25-bit immediate value for branch instructions with J1J2 range +/// extension (formats B T4, BL T1 and BLX T2). +/// +/// [ 00000:S:Imm10, 00:J1:0:J2:Imm11] -> S:I1:I2:Imm10:Imm11:0 +/// +int64_t decodeImmBT4BlT1BlxT2_J1J2(uint32_t Hi, uint32_t Lo) { + uint32_t S = Hi & 0x0400; + uint32_t I1 = ~((Lo ^ (Hi << 3)) << 10) & 0x00800000; + uint32_t I2 = ~((Lo ^ (Hi << 1)) << 11) & 0x00400000; + uint32_t Imm10 = Hi & 0x03ff; + uint32_t Imm11 = Lo & 0x07ff; + return SignExtend64<25>(S << 14 | I1 | I2 | Imm10 << 12 | Imm11 << 1); +} + +/// Encode 16-bit immediate value for move instruction formats MOVT T1 and +/// MOVW T3. +/// +/// Imm4:Imm1:Imm3:Imm8 -> [ 00000:i:000000:Imm4, 0:Imm3:0000:Imm8 ] +/// +HalfWords encodeImmMovtT1MovwT3(uint16_t Value) { + uint32_t Imm4 = (Value >> 12) & 0x0f; + uint32_t Imm1 = (Value >> 11) & 0x01; + uint32_t Imm3 = (Value >> 8) & 0x07; + uint32_t Imm8 = Value & 0xff; + return HalfWords{Imm1 << 10 | Imm4, Imm3 << 12 | Imm8}; +} + +/// Decode 16-bit immediate value from move instruction formats MOVT T1 and +/// MOVW T3. +/// +/// [ 00000:i:000000:Imm4, 0:Imm3:0000:Imm8 ] -> Imm4:Imm1:Imm3:Imm8 +/// +uint16_t decodeImmMovtT1MovwT3(uint32_t Hi, uint32_t Lo) { + uint32_t Imm4 = Hi & 0x0f; + uint32_t Imm1 = (Hi >> 10) & 0x01; + uint32_t Imm3 = (Lo >> 12) & 0x07; + uint32_t Imm8 = Lo & 0xff; + uint32_t Imm16 = Imm4 << 12 | Imm1 << 11 | Imm3 << 8 | Imm8; + assert(Imm16 <= 0xffff && "Decoded value out-of-range"); + return Imm16; +} + +/// Encode register ID for instruction formats MOVT T1 and MOVW T3. +/// +/// Rd4 -> [0000000000000000, 0000:Rd4:00000000] +/// +HalfWords encodeRegMovtT1MovwT3(int64_t Value) { + uint32_t Rd4 = (Value & 0x0f) << 8; + return HalfWords{0, Rd4}; +} + +/// Decode register ID from instruction formats MOVT T1 and MOVW T3. +/// +/// [0000000000000000, 0000:Rd4:00000000] -> Rd4 +/// +int64_t decodeRegMovtT1MovwT3(uint32_t Hi, uint32_t Lo) { + uint32_t Rd4 = (Lo >> 8) & 0x0f; + return Rd4; +} + +/// 32-bit Thumb instructions are stored as two little-endian halfwords. +/// An instruction at address A encodes bytes A+1, A in the first halfword (Hi), +/// followed by bytes A+3, A+2 in the second halfword (Lo). +struct WritableThumbRelocation { + /// Create a writable reference to a Thumb32 fixup. + WritableThumbRelocation(char *FixupPtr) + : Hi{*reinterpret_cast(FixupPtr)}, + Lo{*reinterpret_cast(FixupPtr + 2)} {} + + support::ulittle16_t &Hi; // First halfword + support::ulittle16_t &Lo; // Second halfword +}; + +struct ThumbRelocation { + /// Create a read-only reference to a Thumb32 fixup. 
+ ThumbRelocation(const char *FixupPtr) + : Hi{*reinterpret_cast(FixupPtr)}, + Lo{*reinterpret_cast(FixupPtr + 2)} {} + + /// Create a read-only Thumb32 fixup from a writeable one. + ThumbRelocation(WritableThumbRelocation &Writable) + : Hi{Writable.Hi}, Lo(Writable.Lo) {} + + const support::ulittle16_t &Hi; // First halfword + const support::ulittle16_t &Lo; // Second halfword +}; + +Error makeUnexpectedOpcodeError(const LinkGraph &G, const ThumbRelocation &R, + Edge::Kind Kind) { + return make_error( + formatv("Invalid opcode [ 0x{0:x4}, 0x{1:x4} ] for relocation: {2}", R.Hi, + R.Lo, G.getEdgeKindName(Kind))); +} + +template bool checkOpcode(const ThumbRelocation &R) { + uint16_t Hi = R.Hi & FixupInfo::OpcodeMask.Hi; + uint16_t Lo = R.Lo & FixupInfo::OpcodeMask.Lo; + return Hi == FixupInfo::Opcode.Hi && Lo == FixupInfo::Opcode.Lo; +} + +template +bool checkRegister(const ThumbRelocation &R, HalfWords Reg) { + uint16_t Hi = R.Hi & FixupInfo::RegMask.Hi; + uint16_t Lo = R.Lo & FixupInfo::RegMask.Lo; + return Hi == Reg.Hi && Lo == Reg.Lo; +} + +template +bool writeRegister(WritableThumbRelocation &R, HalfWords Reg) { + static constexpr HalfWords Mask = FixupInfo::RegMask; + assert((Mask.Hi & Reg.Hi) == Reg.Hi && (Mask.Hi & Reg.Hi) == Reg.Hi && + "Value bits exceed bit range of given mask"); + R.Hi = (R.Hi & ~Mask.Hi) | Reg.Hi; + R.Lo = (R.Lo & ~Mask.Lo) | Reg.Lo; +} + +template +void writeImmediate(WritableThumbRelocation &R, HalfWords Imm) { + static constexpr HalfWords Mask = FixupInfo::ImmMask; + assert((Mask.Hi & Imm.Hi) == Imm.Hi && (Mask.Hi & Imm.Hi) == Imm.Hi && + "Value bits exceed bit range of given mask"); + R.Hi = (R.Hi & ~Mask.Hi) | Imm.Hi; + R.Lo = (R.Lo & ~Mask.Lo) | Imm.Lo; +} + +Expected readAddendData(LinkGraph &G, Block &B, const Edge &E) { + endianness Endian = G.getEndianness(); + assert(Endian != native && "Declare as little or big explicitly"); + + Edge::Kind Kind = E.getKind(); + const char *BlockWorkingMem = B.getContent().data(); + const char *FixupPtr = BlockWorkingMem + E.getOffset(); + + switch (Kind) { + case Data_Delta32: + return SignExtend64<32>((Endian == little) ? read32(FixupPtr) + : read32(FixupPtr)); + default: + return make_error( + "In graph " + G.getName() + ", section " + B.getSection().getName() + + " can not read implicit addend for aarch32 edge kind " + + G.getEdgeKindName(E.getKind())); + } +} + +Expected readAddendArm(LinkGraph &G, Block &B, const Edge &E) { + Edge::Kind Kind = E.getKind(); + + switch (Kind) { + case Arm_Call: + return make_error( + "Addend extraction for relocation type not yet implemented: " + + StringRef(G.getEdgeKindName(Kind))); + default: + return make_error( + "In graph " + G.getName() + ", section " + B.getSection().getName() + + " can not read implicit addend for aarch32 edge kind " + + G.getEdgeKindName(E.getKind())); + } +} + +Expected readAddendThumb(LinkGraph &G, Block &B, const Edge &E, + const ArmConfig &ArmCfg) { + ThumbRelocation R(B.getContent().data() + E.getOffset()); + Edge::Kind Kind = E.getKind(); + + switch (Kind) { + case Thumb_Call: + if (!checkOpcode(R)) + return makeUnexpectedOpcodeError(G, R, Kind); + return LLVM_LIKELY(ArmCfg.J1J2BranchEncoding) + ? 
decodeImmBT4BlT1BlxT2_J1J2(R.Hi, R.Lo) + : decodeImmBT4BlT1BlxT2(R.Hi, R.Lo); + + case Thumb_Jump24: + if (!checkOpcode(R)) + return makeUnexpectedOpcodeError(G, R, Kind); + if (R.Lo & FixupInfo::LoBitConditional) + return make_error("Relocation expects an unconditional " + "B.W branch instruction: " + + StringRef(G.getEdgeKindName(Kind))); + return LLVM_LIKELY(ArmCfg.J1J2BranchEncoding) + ? decodeImmBT4BlT1BlxT2_J1J2(R.Hi, R.Lo) + : decodeImmBT4BlT1BlxT2(R.Hi, R.Lo); + + case Thumb_MovwAbsNC: + if (!checkOpcode(R)) + return makeUnexpectedOpcodeError(G, R, Kind); + // Initial addend is interpreted as a signed value + return SignExtend64<16>(decodeImmMovtT1MovwT3(R.Hi, R.Lo)); + + case Thumb_MovtAbs: + if (!checkOpcode(R)) + return makeUnexpectedOpcodeError(G, R, Kind); + // Initial addend is interpreted as a signed value + return SignExtend64<16>(decodeImmMovtT1MovwT3(R.Hi, R.Lo)); + + default: + return make_error( + "In graph " + G.getName() + ", section " + B.getSection().getName() + + " can not read implicit addend for aarch32 edge kind " + + G.getEdgeKindName(E.getKind())); + } +} + +Error applyFixupData(LinkGraph &G, Block &B, const Edge &E) { + using namespace support; + + char *BlockWorkingMem = B.getAlreadyMutableContent().data(); + char *FixupPtr = BlockWorkingMem + E.getOffset(); + + auto Write32 = [FixupPtr, Endian = G.getEndianness()](int64_t Value) { + assert(Endian != native && "Must be explicit: little or big"); + assert(isInt<32>(Value) && "Must be in signed 32-bit range"); + uint32_t Imm = static_cast(Value); + if (LLVM_LIKELY(Endian == little)) + endian::write32(FixupPtr, Imm); + else + endian::write32(FixupPtr, Imm); + }; + + Edge::Kind Kind = E.getKind(); + uint64_t FixupAddress = (B.getAddress() + E.getOffset()).getValue(); + int64_t Addend = E.getAddend(); + Symbol &TargetSymbol = E.getTarget(); + uint64_t TargetAddress = TargetSymbol.getAddress().getValue(); + assert(!TargetSymbol.hasTargetFlags(ThumbSymbol)); + + // Regular data relocations have size 4, alignment 1 and write the full 32-bit + // result to the place; no need for overflow checking. 
There are three + // exceptions: R_ARM_ABS8, R_ARM_ABS16, R_ARM_PREL31 + switch (Kind) { + case Data_Delta32: { + int64_t Value = TargetAddress - FixupAddress + Addend; + if (!isInt<32>(Value)) + return makeTargetOutOfRangeError(G, B, E); + Write32(Value); + return Error::success(); + } + default: + return make_error( + "In graph " + G.getName() + ", section " + B.getSection().getName() + + " encountered unfixable aarch32 edge kind " + + G.getEdgeKindName(E.getKind())); + } +} + +Error applyFixupArm(LinkGraph &G, Block &B, const Edge &E) { + Edge::Kind Kind = E.getKind(); + + switch (Kind) { + case Arm_Call: + return make_error( + "Fix-up for relocation type not yet implemented: " + + StringRef(G.getEdgeKindName(Kind))); + default: + return make_error( + "In graph " + G.getName() + ", section " + B.getSection().getName() + + " encountered unfixable aarch32 edge kind " + + G.getEdgeKindName(E.getKind())); + } +} + +Error applyFixupThumb(LinkGraph &G, Block &B, const Edge &E, + const ArmConfig &ArmCfg) { + WritableThumbRelocation R(B.getAlreadyMutableContent().data() + + E.getOffset()); + + Edge::Kind Kind = E.getKind(); + uint64_t FixupAddress = (B.getAddress() + E.getOffset()).getValue(); + int64_t Addend = E.getAddend(); + Symbol &TargetSymbol = E.getTarget(); + uint64_t TargetAddress = TargetSymbol.getAddress().getValue(); + if (TargetSymbol.hasTargetFlags(ThumbSymbol)) + TargetAddress |= 0x01; + + switch (Kind) { + case Thumb_Jump24: { + if (!checkOpcode(R)) + return makeUnexpectedOpcodeError(G, R, Kind); + if (R.Lo & FixupInfo::LoBitConditional) + return make_error("Relocation expects an unconditional " + "B.W branch instruction: " + + StringRef(G.getEdgeKindName(Kind))); + if (!(TargetSymbol.hasTargetFlags(ThumbSymbol))) + return make_error("Branch relocation needs interworking " + "stub when bridging to ARM: " + + StringRef(G.getEdgeKindName(Kind))); + + int64_t Value = TargetAddress - FixupAddress + Addend; + if (LLVM_LIKELY(ArmCfg.J1J2BranchEncoding)) { + if (!isInt<25>(Value)) + return makeTargetOutOfRangeError(G, B, E); + writeImmediate(R, encodeImmBT4BlT1BlxT2_J1J2(Value)); + } else { + if (!isInt<22>(Value)) + return makeTargetOutOfRangeError(G, B, E); + writeImmediate(R, encodeImmBT4BlT1BlxT2(Value)); + } + + return Error::success(); + } + + case Thumb_Call: { + if (!checkOpcode(R)) + return makeUnexpectedOpcodeError(G, R, Kind); + + int64_t Value = TargetAddress - FixupAddress + Addend; + + // The call instruction itself is Thumb. The call destination can either be + // Thumb or Arm. We use BL to stay in Thumb and BLX to change to Arm. + bool TargetIsArm = !TargetSymbol.hasTargetFlags(ThumbSymbol); + bool InstrIsBlx = (R.Lo & FixupInfo::LoBitNoBlx) == 0; + if (TargetIsArm != InstrIsBlx) { + if (LLVM_LIKELY(TargetIsArm)) { + // Change opcode BL -> BLX and fix range value (account for 4-byte + // aligned destination while instruction may only be 2-byte aligned + // and clear Thumb bit). 
+ R.Lo = R.Lo & ~FixupInfo::LoBitNoBlx; + R.Lo = R.Lo & ~FixupInfo::LoBitH; + Value = alignTo(Value, 4); + } else { + // Change opcode BLX -> BL and set Thumb bit + R.Lo = R.Lo & ~FixupInfo::LoBitNoBlx; + Value |= 0x01; + } + } + + if (LLVM_LIKELY(ArmCfg.J1J2BranchEncoding)) { + if (!isInt<25>(Value)) + return makeTargetOutOfRangeError(G, B, E); + writeImmediate(R, encodeImmBT4BlT1BlxT2_J1J2(Value)); + } else { + if (!isInt<22>(Value)) + return makeTargetOutOfRangeError(G, B, E); + writeImmediate(R, encodeImmBT4BlT1BlxT2(Value)); + } + + assert(((R.Lo & FixupInfo::LoBitNoBlx) || + (R.Lo & FixupInfo::LoBitH) == 0) && + "Opcode BLX implies H bit is clear (avoid UB in BLX T2)"); + return Error::success(); + } + + case Thumb_MovwAbsNC: { + if (!checkOpcode(R)) + return makeUnexpectedOpcodeError(G, R, Kind); + uint16_t Value = (TargetAddress + Addend) & 0xffff; + writeImmediate(R, encodeImmMovtT1MovwT3(Value)); + return Error::success(); + } + + case Thumb_MovtAbs: { + if (!checkOpcode(R)) + return makeUnexpectedOpcodeError(G, R, Kind); + uint16_t Value = ((TargetAddress + Addend) >> 16) & 0xffff; + writeImmediate(R, encodeImmMovtT1MovwT3(Value)); + return Error::success(); + } + + default: + return make_error( + "In graph " + G.getName() + ", section " + B.getSection().getName() + + " encountered unfixable aarch32 edge kind " + + G.getEdgeKindName(E.getKind())); + } +} + +const uint8_t Thumbv7ABS[] = { + 0x40, 0xf2, 0x00, 0x0c, // movw r12, #0x0000 ; lower 16-bit + 0xc0, 0xf2, 0x00, 0x0c, // movt r12, #0x0000 ; upper 16-bit + 0x60, 0x47 // bx r12 +}; + +template <> +Symbol &StubsManager::createEntry(LinkGraph &G, Symbol &Target) { + constexpr uint64_t Alignment = 4; + Block &B = addStub(G, Thumbv7ABS, Alignment); + LLVM_DEBUG({ + const char *StubPtr = B.getContent().data(); + HalfWords Reg12 = encodeRegMovtT1MovwT3(12); + assert(checkRegister(StubPtr, Reg12) && + checkRegister(StubPtr + 4, Reg12) && + "Linker generated stubs may only corrupt register r12 (IP)"); + }); + B.addEdge(Thumb_MovwAbsNC, 0, Target, 0); + B.addEdge(Thumb_MovtAbs, 4, Target, 0); + Symbol &Stub = G.addAnonymousSymbol(B, 0, B.getSize(), true, false); + Stub.setTargetFlags(ThumbSymbol); + return Stub; +} + +const char *getEdgeKindName(Edge::Kind K) { +#define KIND_NAME_CASE(K) \ + case K: \ + return #K; + + switch (K) { + KIND_NAME_CASE(Data_Delta32) + KIND_NAME_CASE(Arm_Call) + KIND_NAME_CASE(Thumb_Call) + KIND_NAME_CASE(Thumb_Jump24) + KIND_NAME_CASE(Thumb_MovwAbsNC) + KIND_NAME_CASE(Thumb_MovtAbs) + default: + return getGenericEdgeKindName(K); + } +#undef KIND_NAME_CASE +} + +const char *getCPUArchName(ARMBuildAttrs::CPUArch K) { +#define CPUARCH_NAME_CASE(K) \ + case K: \ + return #K; + + using namespace ARMBuildAttrs; + switch (K) { + CPUARCH_NAME_CASE(Pre_v4) + CPUARCH_NAME_CASE(v4) + CPUARCH_NAME_CASE(v4T) + CPUARCH_NAME_CASE(v5T) + CPUARCH_NAME_CASE(v5TE) + CPUARCH_NAME_CASE(v5TEJ) + CPUARCH_NAME_CASE(v6) + CPUARCH_NAME_CASE(v6KZ) + CPUARCH_NAME_CASE(v6T2) + CPUARCH_NAME_CASE(v6K) + CPUARCH_NAME_CASE(v7) + CPUARCH_NAME_CASE(v6_M) + CPUARCH_NAME_CASE(v6S_M) + CPUARCH_NAME_CASE(v7E_M) + CPUARCH_NAME_CASE(v8_A) + CPUARCH_NAME_CASE(v8_R) + CPUARCH_NAME_CASE(v8_M_Base) + CPUARCH_NAME_CASE(v8_M_Main) + CPUARCH_NAME_CASE(v8_1_M_Main) + CPUARCH_NAME_CASE(v9_A) + } + llvm_unreachable("Missing CPUArch in switch?"); +#undef CPUARCH_NAME_CASE +} + +} // namespace aarch32 +} // namespace jitlink +} // namespace llvm diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp 
b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp index 2c270cd66285d..83a09b8d41e91 100644 --- a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp @@ -8,6 +8,7 @@ #include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h" #include "llvm/ExecutionEngine/JITLink/EHFrameSupport.h" +#include "llvm/ExecutionEngine/JITLink/aarch32.h" #include "llvm/ExecutionEngine/Orc/DebugObjectManagerPlugin.h" #include "llvm/ExecutionEngine/Orc/ObjectFileInterface.h" #include "llvm/ExecutionEngine/Orc/Shared/ObjectFormats.h" @@ -40,7 +41,10 @@ bool hasInitializerSection(jitlink::LinkGraph &G) { } JITTargetAddress getJITSymbolPtrForSymbol(Symbol &Sym) { - return Sym.getAddress().getValue(); + uint64_t CallableAddr = Sym.getAddress().getValue(); + if (Sym.isCallable() && Sym.hasTargetFlags(aarch32::ThumbSymbol)) + CallableAddr |= 0x01; // thumb bit + return CallableAddr; } JITSymbolFlags getJITSymbolFlagsForSymbol(Symbol &Sym) { diff --git a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_thumbv7_printf.s b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_thumbv7_printf.s new file mode 100644 index 0000000000000..11a77c95cfa8f --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_thumbv7_printf.s @@ -0,0 +1,46 @@ +// RUN: llvm-mc -triple=thumbv7-none-linux-gnueabi -arm-add-build-attributes -filetype=obj -o %t.o %s +// RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 -slab-allocate 10Kb -slab-page-size 4096 -abs printf=0x76bbe880 -show-entry-es %t.o | FileCheck %s + +// Check that main is a thumb symbol (with LSB set) and printf is arm (with LSB clear) +// +// CHECK-LABEL: Symbol table: +// CHECK-NEXT: "main": 0x{{[0-9a-f]+[13579bdf]}} [Callable] Ready +// CHECK-NEXT: "printf": 0x76bbe880 [Data] Ready + + .globl main + .p2align 2 + .type main,%function + .code 16 + .thumb_func +main: + .fnstart + .save {r7, lr} + push {r7, lr} + .setfp r7, sp + mov r7, sp + .pad #8 + sub sp, #8 + movs r0, #0 + str r0, [sp] + str r0, [sp, #4] + ldr r0, .LCPI0_0 +.LPC0_0: + add r0, pc + bl printf + ldr r0, [sp] + add sp, #8 + pop {r7, pc} + + .p2align 2 +.LCPI0_0: + .long .L.str-(.LPC0_0+4) + + .size main, .-main + .cantunwind + .fnend + + .type .L.str,%object + .section .rodata.str1.1,"aMS",%progbits,1 +.L.str: + .asciz "Hello AArch32!\n" + .size .L.str, 12 diff --git a/llvm/test/ExecutionEngine/JITLink/AArch32/lit.local.cfg b/llvm/test/ExecutionEngine/JITLink/AArch32/lit.local.cfg new file mode 100644 index 0000000000000..20e19aeb06f9d --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/AArch32/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'ARM' in config.root.targets: + config.unsupported = True diff --git a/llvm/unittests/ExecutionEngine/JITLink/AArch32Tests.cpp b/llvm/unittests/ExecutionEngine/JITLink/AArch32Tests.cpp new file mode 100644 index 0000000000000..0e41174040b68 --- /dev/null +++ b/llvm/unittests/ExecutionEngine/JITLink/AArch32Tests.cpp @@ -0,0 +1,200 @@ +//===------- AArch32Tests.cpp - Unit tests for the AArch32 backend --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include "gtest/gtest.h" + +using namespace llvm; +using namespace llvm::jitlink; +using namespace llvm::jitlink::aarch32; +using namespace llvm::support; +using namespace llvm::support::endian; + +struct MutableHalfWords { + MutableHalfWords(HalfWords Preset) : Hi(Preset.Hi), Lo(Preset.Lo) {} + + void patch(HalfWords Value, HalfWords Mask) { + Hi = (Hi & ~Mask.Hi) | Value.Hi; + Lo = (Lo & ~Mask.Lo) | Value.Lo; + } + + uint16_t Hi; // First halfword + uint16_t Lo; // Second halfword +}; + +namespace llvm { +namespace jitlink { + +Expected getJITLinkEdgeKind(uint32_t ELFType); +Expected getELFRelocationType(Edge::Kind Kind); + +} // namespace jitlink +} // namespace llvm + +TEST(AArch32_ELF, EdgeKinds) { + // Fails: Invalid ELF type -> JITLink kind + Expected ErrKind = getJITLinkEdgeKind(ELF::R_ARM_NONE); + EXPECT_TRUE(errorToBool(ErrKind.takeError())); + + // Fails: Invalid JITLink kind -> ELF type + Expected ErrType = getELFRelocationType(Edge::Invalid); + EXPECT_TRUE(errorToBool(ErrType.takeError())); + + for (Edge::Kind K = FirstDataRelocation; K < LastThumbRelocation; K += 1) { + Expected ELFType = getELFRelocationType(K); + EXPECT_FALSE(errorToBool(ELFType.takeError())) + << "Failed to translate JITLink kind -> ELF type"; + + Expected JITLinkKind = getJITLinkEdgeKind(*ELFType); + EXPECT_FALSE(errorToBool(JITLinkKind.takeError())) + << "Failed to translate ELF type -> JITLink kind"; + + EXPECT_EQ(*JITLinkKind, K) << "Round-trip value inconsistent?"; + } +} + +namespace llvm { +namespace jitlink { +namespace aarch32 { + +HalfWords encodeImmBT4BlT1BlxT2(int64_t Value); +HalfWords encodeImmBT4BlT1BlxT2_J1J2(int64_t Value); +HalfWords encodeImmMovtT1MovwT3(uint16_t Value); +HalfWords encodeRegMovtT1MovwT3(int64_t Value); + +int64_t decodeImmBT4BlT1BlxT2(uint32_t Hi, uint32_t Lo); +int64_t decodeImmBT4BlT1BlxT2_J1J2(uint32_t Hi, uint32_t Lo); +uint16_t decodeImmMovtT1MovwT3(uint32_t Hi, uint32_t Lo); +int64_t decodeRegMovtT1MovwT3(uint32_t Hi, uint32_t Lo); + +} // namespace aarch32 +} // namespace jitlink +} // namespace llvm + +// Big-endian for v7 and v8 (and v6 unless in legacy backwards compatible mode +// be32) have little-endian instructions and big-endian data. In ELF relocatable +// objects big-endian instructions may still be encountered. A be8 supporting +// linker is expected to endian-reverse instructions for the executable. 
+template +static HalfWords makeHalfWords(std::array Mem) { + return HalfWords{read16(Mem.data()), read16(Mem.data() + 2)}; +} + +/// 25-bit branch with link (with J1J2 range extension) +TEST(AArch32_Relocations, Thumb_Call_J1J2) { + static_assert(isInt<25>(16777215), "Max value"); + static_assert(isInt<25>(-16777215), "Min value"); + static_assert(!isInt<25>(16777217), "First overflow"); + static_assert(!isInt<25>(-16777217), "First underflow"); + + constexpr HalfWords ImmMask = FixupInfo::ImmMask; + + static std::array MemPresets{ + makeHalfWords({0xff, 0xf7, 0xfe, 0xef}), // common + makeHalfWords({0x00, 0x00, 0x00, 0x00}), // zeros + makeHalfWords({0xff, 0xff, 0xff, 0xff}), // ones + }; + + auto EncodeDecode = [ImmMask](int64_t In, MutableHalfWords &Mem) { + Mem.patch(encodeImmBT4BlT1BlxT2_J1J2(In), ImmMask); + return decodeImmBT4BlT1BlxT2_J1J2(Mem.Hi, Mem.Lo); + }; + + for (MutableHalfWords Mem : MemPresets) { + HalfWords UnaffectedBits(Mem.Hi & ~ImmMask.Hi, Mem.Lo & ~ImmMask.Lo); + + EXPECT_EQ(EncodeDecode(1, Mem), 0); // Zero value + EXPECT_EQ(EncodeDecode(0x41, Mem), 0x40); // Common value + EXPECT_EQ(EncodeDecode(16777215, Mem), 16777214); // Maximum value + EXPECT_EQ(EncodeDecode(-16777215, Mem), -16777216); // Minimum value + EXPECT_NE(EncodeDecode(16777217, Mem), 16777217); // First overflow + EXPECT_NE(EncodeDecode(-16777217, Mem), -16777217); // First underflow + + EXPECT_TRUE(UnaffectedBits.Hi == (Mem.Hi & ~ImmMask.Hi) && + UnaffectedBits.Lo == (Mem.Lo & ~ImmMask.Lo)) + << "Diff outside immediate field"; + } +} + +/// 22-bit branch with link (without J1J2 range extension) +TEST(AArch32_Relocations, Thumb_Call_Bare) { + static_assert(isInt<22>(2097151), "Max value"); + static_assert(isInt<22>(-2097151), "Min value"); + static_assert(!isInt<22>(2097153), "First overflow"); + static_assert(!isInt<22>(-2097153), "First underflow"); + + constexpr HalfWords ImmMask = FixupInfo::ImmMask; + + static std::array MemPresets{ + makeHalfWords({0xff, 0xf7, 0xfe, 0xef}), // common + makeHalfWords({0x00, 0x00, 0x00, 0x00}), // zeros + makeHalfWords({0xff, 0xff, 0xff, 0xff}), // ones + }; + + auto EncodeDecode = [ImmMask](int64_t In, MutableHalfWords &Mem) { + Mem.patch(encodeImmBT4BlT1BlxT2_J1J2(In), ImmMask); + return decodeImmBT4BlT1BlxT2_J1J2(Mem.Hi, Mem.Lo); + }; + + for (MutableHalfWords Mem : MemPresets) { + HalfWords UnaffectedBits(Mem.Hi & ~ImmMask.Hi, Mem.Lo & ~ImmMask.Lo); + + EXPECT_EQ(EncodeDecode(1, Mem), 0); // Zero value + EXPECT_EQ(EncodeDecode(0x41, Mem), 0x40); // Common value + EXPECT_EQ(EncodeDecode(2097151, Mem), 2097150); // Maximum value + EXPECT_EQ(EncodeDecode(-2097151, Mem), -2097152); // Minimum value + EXPECT_NE(EncodeDecode(2097153, Mem), 2097153); // First overflow + EXPECT_NE(EncodeDecode(-2097153, Mem), -2097153); // First underflow + + EXPECT_TRUE(UnaffectedBits.Hi == (Mem.Hi & ~ImmMask.Hi) && + UnaffectedBits.Lo == (Mem.Lo & ~ImmMask.Lo)) + << "Diff outside immediate field"; + } +} + +/// Write immediate value to the top halfword of the destination register +TEST(AArch32_Relocations, Thumb_MovtAbs) { + static_assert(isUInt<16>(65535), "Max value"); + static_assert(!isUInt<16>(65536), "First overflow"); + + constexpr HalfWords ImmMask = FixupInfo::ImmMask; + constexpr HalfWords RegMask = FixupInfo::RegMask; + + static std::array Registers{0, 5, 12}; + static std::array MemPresets{ + makeHalfWords({0xff, 0xf7, 0xfe, 0xef}), // common + makeHalfWords({0x00, 0x00, 0x00, 0x00}), // zeros + makeHalfWords({0xff, 0xff, 0xff, 0xff}), // ones + }; + + auto 
EncodeDecode = [ImmMask](uint32_t In, MutableHalfWords &Mem) { + Mem.patch(encodeImmMovtT1MovwT3(In), ImmMask); + return decodeImmMovtT1MovwT3(Mem.Hi, Mem.Lo); + }; + + for (MutableHalfWords Mem : MemPresets) { + for (uint8_t Reg : Registers) { + HalfWords UnaffectedBits(Mem.Hi & ~(ImmMask.Hi | RegMask.Hi), + Mem.Lo & ~(ImmMask.Lo | RegMask.Lo)); + + Mem.patch(encodeRegMovtT1MovwT3(Reg), RegMask); + EXPECT_EQ(EncodeDecode(0x76bb, Mem), 0x76bb); // Common value + EXPECT_EQ(EncodeDecode(0, Mem), 0); // Minimum value + EXPECT_EQ(EncodeDecode(0xffff, Mem), 0xffff); // Maximum value + EXPECT_NE(EncodeDecode(0x10000, Mem), 0x10000); // First overflow + + // Destination register as well as unaffacted bits should be intact + EXPECT_EQ(decodeRegMovtT1MovwT3(Mem.Hi, Mem.Lo), Reg); + EXPECT_TRUE(UnaffectedBits.Hi == (Mem.Hi & ~(ImmMask.Hi | RegMask.Hi)) && + UnaffectedBits.Lo == (Mem.Lo & ~(ImmMask.Lo | RegMask.Lo))) + << "Diff outside immediate/register field"; + } + } +} diff --git a/llvm/unittests/ExecutionEngine/JITLink/CMakeLists.txt b/llvm/unittests/ExecutionEngine/JITLink/CMakeLists.txt index 1a71a62d3756d..978914c748c63 100644 --- a/llvm/unittests/ExecutionEngine/JITLink/CMakeLists.txt +++ b/llvm/unittests/ExecutionEngine/JITLink/CMakeLists.txt @@ -8,6 +8,7 @@ set(LLVM_LINK_COMPONENTS ) add_llvm_unittest(JITLinkTests + AArch32Tests.cpp EHFrameSupportTests.cpp LinkGraphTests.cpp ) From b7677846da66fd003a538f87fd8de948cfcc3d6a Mon Sep 17 00:00:00 2001 From: Carlos Galvez Date: Thu, 23 Mar 2023 10:30:01 +0000 Subject: [PATCH 093/208] [clang-tidy][NFC] Fix broken link in Release Notes --- clang-tools-extra/docs/ReleaseNotes.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 89419141cebbd..a5f090045615c 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -238,8 +238,8 @@ Changes in existing checks string for ``Prefix`` or ``Suffix`` options could result in the style not being used. -- Fixed an issue in :doc:`google-avoid-underscore-in-googletest-name - ` when using +- Fixed an issue in :doc:`google-readability-avoid-underscore-in-googletest-name + ` when using ``DISABLED_`` in the test suite name. Removed checks From 61f33def1375a68afc5681627a62ce24446e45e2 Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Thu, 23 Mar 2023 10:12:56 +0000 Subject: [PATCH 094/208] [mlir][Vector] Make sure that vector.contract preserves extra attributes while parsing The old implementation parsed the optional attribute dict, only to replace its contents by using `assign`. Reviewed By: ftynse Differential Revision: https://reviews.llvm.org/D146707 --- mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 4 ++-- mlir/test/Dialect/Vector/ops.mlir | 12 ++++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index 9796693b4b6cd..21daff60c7e62 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -640,7 +640,7 @@ ParseResult ContractionOp::parse(OpAsmParser &parser, OperationState &result) { auto loc = parser.getCurrentLocation(); DictionaryAttr dictAttr; // TODO: Unify linalg op attribute parsing. 
-  if (parser.parseAttribute(dictAttr, "_", result.attributes) ||
+  if (parser.parseAttribute(dictAttr) ||
       parser.parseOperand(lhsInfo) || parser.parseComma() ||
       parser.parseOperand(rhsInfo) || parser.parseComma() ||
       parser.parseOperand(accInfo) ||
@@ -653,7 +653,7 @@ ParseResult ContractionOp::parse(OpAsmParser &parser, OperationState &result) {
       parser.resolveOperand(accInfo, resultType, result.operands) ||
       parser.addTypeToList(resultType, result.types))
     return failure();
-  result.attributes.assign(dictAttr.getValue().begin(),
+  result.attributes.append(dictAttr.getValue().begin(),
                            dictAttr.getValue().end());
   // Convert array of string into an array of IteratyType enums. This is needed,
diff --git a/mlir/test/Dialect/Vector/ops.mlir b/mlir/test/Dialect/Vector/ops.mlir
index 60e1507293f7e..4013d5daee8cc 100644
--- a/mlir/test/Dialect/Vector/ops.mlir
+++ b/mlir/test/Dialect/Vector/ops.mlir
@@ -291,6 +291,18 @@ func.func @contraction_to_scalar(%arg0: vector<10xf32>, %arg1: vector<10xf32>) -
   return %0 : f32
 }
+// CHECK-LABEL: @contraction_extra_attrs
+func.func @contraction_extra_attrs(%arg0: vector<10xf32>, %arg1: vector<10xf32>) -> f32 {
+  // CHECK: %[[C0:.*]] = arith.constant 0.000000e+00 : f32
+  %f0 = arith.constant 0.0: f32
+  // CHECK: %[[X:.*]] = vector.contract {indexing_maps = [#{{.*}}, #{{.*}}, #{{.*}}], iterator_types = ["reduction"], kind = #vector.kind} %{{.*}}, %{{.*}}, %[[C0]] {first_attr = 1 : i32, second_attr = "string"} : vector<10xf32>, vector<10xf32> into f32
+  %0 = vector.contract #contraction_to_scalar_trait %arg0, %arg1, %f0
+    {first_attr = 1 : i32, second_attr = "string"}
+    : vector<10xf32>, vector<10xf32> into f32
+  // CHECK: return %[[X]] : f32
+  return %0 : f32
+}
+
 #contraction_to_scalar_max_accesses = [
   affine_map<(i) -> (i)>,
   affine_map<(i) -> (i)>,

From 8d16c6809a080947057ae21b9f6165105b4b2ad8 Mon Sep 17 00:00:00 2001
From: Luke Lau
Date: Fri, 10 Feb 2023 11:03:55 +0000
Subject: [PATCH 095/208] [RISCV] Increase default vectorizer LMUL to 2

After some discussion and experimentation, we have seen that changing
the default number of vector register bits to LMUL=2 strikes a sweet
spot.

Whilst we could be clever here and make the vectorizer smarter about
dynamically selecting an LMUL that

a) Doesn't affect register pressure
b) Is suitable for the microarchitecture

we would need to teach its heuristics about RISC-V register grouping
specifics.

Instead this just does the easy, pragmatic thing by changing the
default to a safe value that doesn't affect register pressure
significantly[1], but should increase throughput and unlock more
interleaving.
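To make the effect concrete, here is a minimal, self-contained sketch of the
arithmetic involved. It is an illustration only, not the actual
getRegisterBitWidth implementation; the helper name minVFElements is made up
for this example, and RVVBitsPerBlock simply mirrors the backend's 64-bit
vector block size.

#include <cstdint>
#include <cstdio>

// One RVV "block" is vscale x 64 bits; LMUL groups that many blocks together.
constexpr uint64_t RVVBitsPerBlock = 64;

// Minimum known element count of the scalable VF the vectorizer would pick
// for elements of ElementBits, given the LMUL used for register-width queries.
uint64_t minVFElements(uint64_t LMUL, uint64_t ElementBits) {
  return (RVVBitsPerBlock * LMUL) / ElementBits;
}

int main() {
  // For an i64 loop: LMUL=1 yields <vscale x 1 x i64>, LMUL=2 yields
  // <vscale x 2 x i64> -- exactly the nxv1i64 -> nxv2i64 updates in the
  // tests below, where the induction step also doubles to vscale * 2.
  std::printf("LMUL=1: vscale x %llu x i64\n",
              (unsigned long long)minVFElements(1, 64));
  std::printf("LMUL=2: vscale x %llu x i64\n",
              (unsigned long long)minVFElements(2, 64));
  return 0;
}

The same comparison can be reproduced by varying
-riscv-v-register-bit-width-lmul, the flag whose default this patch changes.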
[1] Register spilling when compiling sqlite at various levels of `-riscv-v-register-bit-width-lmul`: LMUL=1 2573 spills LMUL=2 2583 spills LMUL=4 2819 spills LMUL=8 3256 spills Reviewed By: craig.topper Differential Revision: https://reviews.llvm.org/D143723 --- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 2 +- .../LoopVectorize/RISCV/defaults.ll | 70 +- .../Transforms/LoopVectorize/RISCV/divrem.ll | 586 ++++++----- .../LoopVectorize/RISCV/illegal-type.ll | 30 +- .../LoopVectorize/RISCV/inloop-reduction.ll | 28 +- .../RISCV/interleaved-accesses.ll | 130 ++- .../Transforms/LoopVectorize/RISCV/lmul.ll | 2 +- .../LoopVectorize/RISCV/low-trip-count.ll | 18 +- .../LoopVectorize/RISCV/mask-index-type.ll | 40 +- .../RISCV/masked_gather_scatter.ll | 152 +-- .../LoopVectorize/RISCV/riscv-interleaved.ll | 4 +- .../LoopVectorize/RISCV/riscv-unroll.ll | 18 +- .../LoopVectorize/RISCV/safe-dep-distance.ll | 107 +- .../LoopVectorize/RISCV/scalable-basics.ll | 358 ++++--- .../RISCV/scalable-reductions.ll | 68 +- .../LoopVectorize/RISCV/scalable-tailfold.ll | 185 ++-- .../LoopVectorize/RISCV/short-trip-count.ll | 33 +- .../LoopVectorize/RISCV/uniform-load-store.ll | 961 +++++++++--------- .../Transforms/LoopVectorize/RISCV/zvl32b.ll | 37 +- .../RISCV/rvv-min-vector-size.ll | 28 +- 20 files changed, 1495 insertions(+), 1362 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index a6a216392de74..2f6b747140e59 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -25,7 +25,7 @@ static cl::opt RVVRegisterWidthLMUL( cl::desc( "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used " "by autovectorized code. Fractional LMULs are not supported."), - cl::init(1), cl::Hidden); + cl::init(2), cl::Hidden); static cl::opt SLPMaxVF( "riscv-v-slp-max-vf", diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll b/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll index d58ff5051c621..4b93ea30cf252 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll @@ -14,27 +14,30 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v) { ; CHECK-LABEL: @vector_add( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ 
[[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store [[TMP5]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store [[TMP7]], ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -49,7 +52,7 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v) { ; CHECK-NEXT: store i64 [[ADD]], ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -74,32 +77,35 @@ define i64 @vector_add_reduce(ptr noalias nocapture %a) { ; CHECK-LABEL: @vector_add_reduce( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; 
CHECK-NEXT: [[TMP5]] = add [[VEC_PHI]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7]] = add [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64( [[TMP5]]) +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[TMP7]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -111,7 +117,7 @@ define i64 @vector_add_reduce(ptr noalias nocapture %a) { ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[SUM_NEXT_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll index 7d079d13dc710..119f50df5b8e3 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll @@ -11,27 +11,30 @@ define void @vector_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-LABEL: @vector_udiv( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: 
[[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = udiv [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store [[TMP5]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = udiv [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store [[TMP7]], ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -54,26 +57,26 @@ define void @vector_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED-NEXT: entry: ; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT2]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP0:%.*]] = 
add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 -; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[TMP6:%.*]] = udiv <2 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; FIXED-NEXT: [[TMP7:%.*]] = udiv <2 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]] -; FIXED-NEXT: store <2 x i64> [[TMP6]], ptr [[TMP4]], align 8 -; FIXED-NEXT: store <2 x i64> [[TMP7]], ptr [[TMP5]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; FIXED-NEXT: [[TMP6:%.*]] = udiv <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; FIXED-NEXT: [[TMP7:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]] +; FIXED-NEXT: store <4 x i64> [[TMP6]], ptr [[TMP4]], align 8 +; FIXED-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP5]], align 8 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; FIXED: middle.block: @@ -115,27 +118,30 @@ define void @vector_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-LABEL: @vector_sdiv( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = sdiv [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store [[TMP5]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() 
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = sdiv [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store [[TMP7]], ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -158,26 +164,26 @@ define void @vector_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED-NEXT: entry: ; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT2]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 -; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[TMP6:%.*]] = sdiv <2 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; FIXED-NEXT: [[TMP7:%.*]] = sdiv <2 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]] -; FIXED-NEXT: store <2 x i64> [[TMP6]], ptr [[TMP4]], align 8 -; FIXED-NEXT: store <2 x i64> [[TMP7]], ptr [[TMP5]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; FIXED-NEXT: [[TMP5:%.*]] = getelementptr 
inbounds i64, ptr [[TMP2]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; FIXED-NEXT: [[TMP6:%.*]] = sdiv <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; FIXED-NEXT: [[TMP7:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]] +; FIXED-NEXT: store <4 x i64> [[TMP6]], ptr [[TMP4]], align 8 +; FIXED-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP5]], align 8 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; FIXED: middle.block: @@ -219,27 +225,30 @@ define void @vector_urem(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-LABEL: @vector_urem( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = urem [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store [[TMP5]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = urem [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store [[TMP7]], ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = 
icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -262,26 +271,26 @@ define void @vector_urem(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED-NEXT: entry: ; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT2]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 -; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[TMP6:%.*]] = urem <2 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; FIXED-NEXT: [[TMP7:%.*]] = urem <2 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]] -; FIXED-NEXT: store <2 x i64> [[TMP6]], ptr [[TMP4]], align 8 -; FIXED-NEXT: store <2 x i64> [[TMP7]], ptr [[TMP5]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; FIXED-NEXT: [[TMP6:%.*]] = urem <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; FIXED-NEXT: [[TMP7:%.*]] = urem <4 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]] +; FIXED-NEXT: store <4 x i64> [[TMP6]], ptr [[TMP4]], align 8 +; FIXED-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP5]], align 8 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; FIXED: middle.block: @@ -323,27 +332,30 @@ define void @vector_srem(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-LABEL: @vector_srem( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 
[[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = srem [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store [[TMP5]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = srem [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store [[TMP7]], ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -366,26 +378,26 @@ define void @vector_srem(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED-NEXT: entry: ; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXED-NEXT: 
[[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT2]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 -; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[TMP6:%.*]] = srem <2 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; FIXED-NEXT: [[TMP7:%.*]] = srem <2 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]] -; FIXED-NEXT: store <2 x i64> [[TMP6]], ptr [[TMP4]], align 8 -; FIXED-NEXT: store <2 x i64> [[TMP7]], ptr [[TMP5]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; FIXED-NEXT: [[TMP6:%.*]] = srem <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; FIXED-NEXT: [[TMP7:%.*]] = srem <4 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]] +; FIXED-NEXT: store <4 x i64> [[TMP6]], ptr [[TMP4]], align 8 +; FIXED-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP5]], align 8 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; FIXED: middle.block: @@ -427,31 +439,34 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-LABEL: @predicated_udiv( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 
[[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne [[BROADCAST_SPLAT]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = select [[TMP5]], [[BROADCAST_SPLAT]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP7:%.*]] = udiv [[WIDE_LOAD]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = xor [[TMP5]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP5]], [[TMP7]], [[WIDE_LOAD]] -; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne [[BROADCAST_SPLAT]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = select [[TMP7]], [[BROADCAST_SPLAT]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP9:%.*]] = udiv [[WIDE_LOAD]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP7]], [[TMP9]], [[WIDE_LOAD]] +; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -480,34 +495,34 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED-NEXT: entry: ; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT2]], <4 x i64> poison, <4 x i32> 
zeroinitializer ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 -; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer -; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT3]], zeroinitializer -; FIXED-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP6]], <2 x i64> [[BROADCAST_SPLAT]], <2 x i64> -; FIXED-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP7]], <2 x i64> [[BROADCAST_SPLAT3]], <2 x i64> -; FIXED-NEXT: [[TMP10:%.*]] = udiv <2 x i64> [[WIDE_LOAD]], [[TMP8]] -; FIXED-NEXT: [[TMP11:%.*]] = udiv <2 x i64> [[WIDE_LOAD1]], [[TMP9]] -; FIXED-NEXT: [[TMP12:%.*]] = xor <2 x i1> [[TMP6]], -; FIXED-NEXT: [[TMP13:%.*]] = xor <2 x i1> [[TMP7]], -; FIXED-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP6]], <2 x i64> [[TMP10]], <2 x i64> [[WIDE_LOAD]] -; FIXED-NEXT: [[PREDPHI4:%.*]] = select <2 x i1> [[TMP7]], <2 x i64> [[TMP11]], <2 x i64> [[WIDE_LOAD1]] -; FIXED-NEXT: store <2 x i64> [[PREDPHI]], ptr [[TMP4]], align 8 -; FIXED-NEXT: store <2 x i64> [[PREDPHI4]], ptr [[TMP5]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer +; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT3]], zeroinitializer +; FIXED-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> +; FIXED-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[BROADCAST_SPLAT3]], <4 x i64> +; FIXED-NEXT: [[TMP10:%.*]] = udiv <4 x i64> [[WIDE_LOAD]], [[TMP8]] +; FIXED-NEXT: [[TMP11:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], [[TMP9]] +; FIXED-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP6]], +; FIXED-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP7]], +; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP10]], <4 x i64> [[WIDE_LOAD]] +; FIXED-NEXT: [[PREDPHI4:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[TMP11]], <4 x i64> [[WIDE_LOAD1]] +; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP4]], align 8 +; FIXED-NEXT: store <4 x i64> [[PREDPHI4]], ptr [[TMP5]], align 8 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; FIXED: middle.block: @@ -561,31 +576,34 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-LABEL: @predicated_sdiv( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: 
[[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne [[BROADCAST_SPLAT]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = select [[TMP5]], [[BROADCAST_SPLAT]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP7:%.*]] = sdiv [[WIDE_LOAD]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = xor [[TMP5]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP5]], [[TMP7]], [[WIDE_LOAD]] -; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne [[BROADCAST_SPLAT]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = select [[TMP7]], [[BROADCAST_SPLAT]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP9:%.*]] = sdiv [[WIDE_LOAD]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP7]], [[TMP9]], [[WIDE_LOAD]] +; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label 
[[SCALAR_PH]] @@ -614,34 +632,34 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED-NEXT: entry: ; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT2]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 -; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer -; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT3]], zeroinitializer -; FIXED-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP6]], <2 x i64> [[BROADCAST_SPLAT]], <2 x i64> -; FIXED-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP7]], <2 x i64> [[BROADCAST_SPLAT3]], <2 x i64> -; FIXED-NEXT: [[TMP10:%.*]] = sdiv <2 x i64> [[WIDE_LOAD]], [[TMP8]] -; FIXED-NEXT: [[TMP11:%.*]] = sdiv <2 x i64> [[WIDE_LOAD1]], [[TMP9]] -; FIXED-NEXT: [[TMP12:%.*]] = xor <2 x i1> [[TMP6]], -; FIXED-NEXT: [[TMP13:%.*]] = xor <2 x i1> [[TMP7]], -; FIXED-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP6]], <2 x i64> [[TMP10]], <2 x i64> [[WIDE_LOAD]] -; FIXED-NEXT: [[PREDPHI4:%.*]] = select <2 x i1> [[TMP7]], <2 x i64> [[TMP11]], <2 x i64> [[WIDE_LOAD1]] -; FIXED-NEXT: store <2 x i64> [[PREDPHI]], ptr [[TMP4]], align 8 -; FIXED-NEXT: store <2 x i64> [[PREDPHI4]], ptr [[TMP5]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer +; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT3]], zeroinitializer +; FIXED-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> +; FIXED-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[BROADCAST_SPLAT3]], <4 x 
i64> +; FIXED-NEXT: [[TMP10:%.*]] = sdiv <4 x i64> [[WIDE_LOAD]], [[TMP8]] +; FIXED-NEXT: [[TMP11:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], [[TMP9]] +; FIXED-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP6]], +; FIXED-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP7]], +; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP10]], <4 x i64> [[WIDE_LOAD]] +; FIXED-NEXT: [[PREDPHI4:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[TMP11]], <4 x i64> [[WIDE_LOAD1]] +; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP4]], align 8 +; FIXED-NEXT: store <4 x i64> [[PREDPHI4]], ptr [[TMP5]], align 8 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; FIXED: middle.block: @@ -695,28 +713,31 @@ define void @predicated_udiv_by_constant(ptr noalias nocapture %a, i64 %n) { ; CHECK-LABEL: @predicated_udiv_by_constant( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP6:%.*]] = udiv [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 27, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP7:%.*]] = xor [[TMP5]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP5]], [[TMP6]], [[WIDE_LOAD]] -; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP8:%.*]] = udiv [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 27, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP9:%.*]] = 
xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP7]], [[TMP8]], [[WIDE_LOAD]] +; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -749,24 +770,24 @@ define void @predicated_udiv_by_constant(ptr noalias nocapture %a, i64 %n) { ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 -; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <2 x i64> [[WIDE_LOAD]], -; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <2 x i64> [[WIDE_LOAD1]], -; FIXED-NEXT: [[TMP8:%.*]] = udiv <2 x i64> [[WIDE_LOAD]], -; FIXED-NEXT: [[TMP9:%.*]] = udiv <2 x i64> [[WIDE_LOAD1]], -; FIXED-NEXT: [[TMP10:%.*]] = xor <2 x i1> [[TMP6]], -; FIXED-NEXT: [[TMP11:%.*]] = xor <2 x i1> [[TMP7]], -; FIXED-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[WIDE_LOAD]] -; FIXED-NEXT: [[PREDPHI2:%.*]] = select <2 x i1> [[TMP7]], <2 x i64> [[TMP9]], <2 x i64> [[WIDE_LOAD1]] -; FIXED-NEXT: store <2 x i64> [[PREDPHI]], ptr [[TMP4]], align 8 -; FIXED-NEXT: store <2 x i64> [[PREDPHI2]], ptr [[TMP5]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD]], +; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD1]], +; FIXED-NEXT: [[TMP8:%.*]] = udiv <4 x i64> [[WIDE_LOAD]], +; FIXED-NEXT: [[TMP9:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], +; FIXED-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP6]], +; FIXED-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP7]], +; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD]] +; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[TMP9]], <4 x i64> [[WIDE_LOAD1]] +; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP4]], align 8 +; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP5]], align 8 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; FIXED: middle.block: @@ -820,28 +841,31 @@ define void 
@predicated_sdiv_by_constant(ptr noalias nocapture %a, i64 %n) { ; CHECK-LABEL: @predicated_sdiv_by_constant( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP6:%.*]] = sdiv [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 27, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP7:%.*]] = xor [[TMP5]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP5]], [[TMP6]], [[WIDE_LOAD]] -; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP8:%.*]] = sdiv [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 27, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP9:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP7]], [[TMP8]], [[WIDE_LOAD]] +; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -874,24 +898,24 @@ define void @predicated_sdiv_by_constant(ptr noalias nocapture %a, i64 %n) { ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] 
= phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 -; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <2 x i64> [[WIDE_LOAD]], -; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <2 x i64> [[WIDE_LOAD1]], -; FIXED-NEXT: [[TMP8:%.*]] = sdiv <2 x i64> [[WIDE_LOAD]], -; FIXED-NEXT: [[TMP9:%.*]] = sdiv <2 x i64> [[WIDE_LOAD1]], -; FIXED-NEXT: [[TMP10:%.*]] = xor <2 x i1> [[TMP6]], -; FIXED-NEXT: [[TMP11:%.*]] = xor <2 x i1> [[TMP7]], -; FIXED-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[WIDE_LOAD]] -; FIXED-NEXT: [[PREDPHI2:%.*]] = select <2 x i1> [[TMP7]], <2 x i64> [[TMP9]], <2 x i64> [[WIDE_LOAD1]] -; FIXED-NEXT: store <2 x i64> [[PREDPHI]], ptr [[TMP4]], align 8 -; FIXED-NEXT: store <2 x i64> [[PREDPHI2]], ptr [[TMP5]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD]], +; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD1]], +; FIXED-NEXT: [[TMP8:%.*]] = sdiv <4 x i64> [[WIDE_LOAD]], +; FIXED-NEXT: [[TMP9:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], +; FIXED-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP6]], +; FIXED-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP7]], +; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD]] +; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[TMP9]], <4 x i64> [[WIDE_LOAD1]] +; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP4]], align 8 +; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP5]], align 8 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; FIXED: middle.block: @@ -945,12 +969,12 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) { ; CHECK-LABEL: @predicated_sdiv_by_minus_one( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -959,15 +983,15 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) { ; CHECK-NEXT: 
[[TMP4:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 1 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i8 -128, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP8:%.*]] = select [[TMP7]], shufflevector ( insertelement ( poison, i8 -1, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP9:%.*]] = sdiv [[WIDE_LOAD]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP7]], [[TMP9]], [[WIDE_LOAD]] -; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP6]], align 1 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i8 -128, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP8:%.*]] = select [[TMP7]], shufflevector ( insertelement ( poison, i8 -1, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP9:%.*]] = sdiv [[WIDE_LOAD]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP7]], [[TMP9]], [[WIDE_LOAD]] +; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP6]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8 +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 16 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] @@ -1003,26 +1027,26 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) { ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 16 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 32 ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 -; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 16 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 -; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <16 x i8> [[WIDE_LOAD]], -; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <16 x i8> [[WIDE_LOAD1]], -; FIXED-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> , <16 x i8> -; FIXED-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP7]], <16 x i8> , <16 x i8> -; FIXED-NEXT: [[TMP10:%.*]] = sdiv <16 x i8> [[WIDE_LOAD]], [[TMP8]] -; FIXED-NEXT: [[TMP11:%.*]] = sdiv <16 x i8> [[WIDE_LOAD1]], [[TMP9]] -; FIXED-NEXT: [[TMP12:%.*]] = xor <16 x i1> [[TMP6]], -; FIXED-NEXT: [[TMP13:%.*]] = xor <16 x i1> [[TMP7]], -; FIXED-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> [[TMP10]], <16 x i8> [[WIDE_LOAD]] -; FIXED-NEXT: 
[[PREDPHI2:%.*]] = select <16 x i1> [[TMP7]], <16 x i8> [[TMP11]], <16 x i8> [[WIDE_LOAD1]] -; FIXED-NEXT: store <16 x i8> [[PREDPHI]], ptr [[TMP4]], align 1 -; FIXED-NEXT: store <16 x i8> [[PREDPHI2]], ptr [[TMP5]], align 1 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP4]], align 1 +; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 32 +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP5]], align 1 +; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <32 x i8> [[WIDE_LOAD]], +; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <32 x i8> [[WIDE_LOAD1]], +; FIXED-NEXT: [[TMP8:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> , <32 x i8> +; FIXED-NEXT: [[TMP9:%.*]] = select <32 x i1> [[TMP7]], <32 x i8> , <32 x i8> +; FIXED-NEXT: [[TMP10:%.*]] = sdiv <32 x i8> [[WIDE_LOAD]], [[TMP8]] +; FIXED-NEXT: [[TMP11:%.*]] = sdiv <32 x i8> [[WIDE_LOAD1]], [[TMP9]] +; FIXED-NEXT: [[TMP12:%.*]] = xor <32 x i1> [[TMP6]], +; FIXED-NEXT: [[TMP13:%.*]] = xor <32 x i1> [[TMP7]], +; FIXED-NEXT: [[PREDPHI:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> [[TMP10]], <32 x i8> [[WIDE_LOAD]] +; FIXED-NEXT: [[PREDPHI2:%.*]] = select <32 x i1> [[TMP7]], <32 x i8> [[TMP11]], <32 x i8> [[WIDE_LOAD1]] +; FIXED-NEXT: store <32 x i8> [[PREDPHI]], ptr [[TMP4]], align 1 +; FIXED-NEXT: store <32 x i8> [[PREDPHI2]], ptr [[TMP5]], align 1 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 ; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; FIXED: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll b/llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll index 56c8f901a668d..0f7600e9b2235 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll @@ -102,31 +102,31 @@ define void @uniform_store_i1(ptr noalias %dst, ptr noalias %start, i64 %N) { ; CHECK-LABEL: @uniform_store_i1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 64 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 32 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 64 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[N_VEC]], 8 ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x ptr> poison, ptr [[START]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x ptr> [[BROADCAST_SPLATINSERT]], <16 x ptr> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x ptr> poison, ptr [[START]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x ptr> [[BROADCAST_SPLATINSERT2]], <16 x ptr> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <32 x ptr> poison, ptr [[START]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <32 x ptr> [[BROADCAST_SPLATINSERT]], <32 x ptr> poison, <32 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <32 x ptr> poison, ptr [[START]], i64 0 +; CHECK-NEXT: 
[[BROADCAST_SPLAT3:%.*]] = shufflevector <32 x ptr> [[BROADCAST_SPLATINSERT2]], <32 x ptr> poison, <32 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <16 x i64> -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <16 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, <16 x ptr> [[TMP2]], i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, <16 x ptr> [[TMP3]], i64 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <16 x ptr> [[TMP4]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <16 x ptr> [[TMP5]], [[BROADCAST_SPLAT3]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x i1> [[TMP7]], i32 15 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <32 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <32 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, <32 x ptr> [[TMP2]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, <32 x ptr> [[TMP3]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <32 x ptr> [[TMP4]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <32 x ptr> [[TMP5]], [[BROADCAST_SPLAT3]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <32 x i1> [[TMP7]], i32 31 ; CHECK-NEXT: store i1 [[TMP8]], ptr [[DST:%.*]], align 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 256 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 +; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 512 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll index 2eb8ac4086f78..1310ed3618b2c 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll @@ -13,31 +13,31 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) { ; OUTLOOP-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; OUTLOOP: for.body.preheader: ; OUTLOOP-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() -; OUTLOOP-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 2 +; OUTLOOP-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4 ; OUTLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], [[TMP1]] ; OUTLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; OUTLOOP: vector.ph: ; OUTLOOP-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() -; OUTLOOP-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 2 +; OUTLOOP-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 4 ; OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], [[TMP3]] ; OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] ; OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]] ; OUTLOOP: vector.body: ; OUTLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] 
], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] ; OUTLOOP-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 ; OUTLOOP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP4]] ; OUTLOOP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 -; OUTLOOP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 2 -; OUTLOOP-NEXT: [[TMP7:%.*]] = sext [[WIDE_LOAD]] to -; OUTLOOP-NEXT: [[TMP8]] = add [[VEC_PHI]], [[TMP7]] +; OUTLOOP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 2 +; OUTLOOP-NEXT: [[TMP7:%.*]] = sext [[WIDE_LOAD]] to +; OUTLOOP-NEXT: [[TMP8]] = add [[VEC_PHI]], [[TMP7]] ; OUTLOOP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() -; OUTLOOP-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 2 +; OUTLOOP-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4 ; OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP10]] ; OUTLOOP-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; OUTLOOP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; OUTLOOP: middle.block: -; OUTLOOP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[TMP8]]) +; OUTLOOP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP8]]) ; OUTLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; OUTLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; OUTLOOP: scalar.ph: @@ -67,12 +67,12 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) { ; INLOOP-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; INLOOP: for.body.preheader: ; INLOOP-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() -; INLOOP-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4 +; INLOOP-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 8 ; INLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], [[TMP1]] ; INLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; INLOOP: vector.ph: ; INLOOP-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() -; INLOOP-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 4 +; INLOOP-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 8 ; INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], [[TMP3]] ; INLOOP-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] ; INLOOP-NEXT: br label [[VECTOR_BODY:%.*]] @@ -82,12 +82,12 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) { ; INLOOP-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 ; INLOOP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP4]] ; INLOOP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 -; INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 2 -; INLOOP-NEXT: [[TMP7:%.*]] = sext [[WIDE_LOAD]] to -; INLOOP-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP7]]) +; INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 2 +; INLOOP-NEXT: [[TMP7:%.*]] = sext [[WIDE_LOAD]] to +; INLOOP-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP7]]) ; INLOOP-NEXT: [[TMP9]] = add i32 [[TMP8]], [[VEC_PHI]] ; INLOOP-NEXT: [[TMP10:%.*]] = call i32 @llvm.vscale.i32() -; INLOOP-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], 4 +; INLOOP-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], 8 ; INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP11]] ; INLOOP-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; INLOOP-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll 
b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll index d51f7becebeb5..827131ed19117 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll @@ -13,18 +13,18 @@ define void @load_store_factor2_i32(ptr %p) { ; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[TMP2]], i32 0 -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC]], +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], ; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 1 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[STRIDED_VEC1]], +; CHECK-NEXT: [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC1]], ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP6]], i32 -1 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP7]], <8 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP7]], <16 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: @@ -125,46 +125,46 @@ define void @load_store_factor3_i32(ptr %p) { ; CHECK-LABEL: @load_store_factor3_i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv2i64() -; CHECK-NEXT: [[TMP5:%.*]] = add [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = mul [[TMP5]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP6]] +; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP5:%.*]] = add [[TMP4]], 
zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = mul [[TMP5]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP6]] ; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP10:%.*]] = mul [[VEC_IND]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[P:%.*]], [[TMP10]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i32.nxv2p0( [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP12:%.*]] = add [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i32.nxv2p0( [[TMP12]], [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[P]], [[TMP13]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv2i32.nxv2p0( [[TMP14]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP15:%.*]] = add [[WIDE_MASKED_GATHER1]], shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i32.nxv2p0( [[TMP15]], [[TMP14]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP16:%.*]] = add [[TMP13]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[P]], [[TMP16]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv2i32.nxv2p0( [[TMP17]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP18:%.*]] = add [[WIDE_MASKED_GATHER2]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i32.nxv2p0( [[TMP18]], [[TMP17]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = mul [[VEC_IND]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[P:%.*]], [[TMP10]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call 
@llvm.masked.gather.nxv4i32.nxv4p0( [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP12:%.*]] = add [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP12]], [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[P]], [[TMP13]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP14]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP15:%.*]] = add [[WIDE_MASKED_GATHER1]], shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP15]], [[TMP14]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP16:%.*]] = add [[TMP13]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[P]], [[TMP16]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP17]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP18:%.*]] = add [[WIDE_MASKED_GATHER2]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP18]], [[TMP17]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2 +; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP20]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: @@ -404,28 +404,28 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP1]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0 -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4 -; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: 
[[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[STRIDED_VEC3]] -; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[STRIDED_VEC2]], [[STRIDED_VEC4]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP6]], align 4 +; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP7]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC3]] +; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], [[STRIDED_VEC4]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[Q]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0 -; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP12]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP10]], i32 4 -; CHECK-NEXT: store <4 x i32> [[TMP9]], ptr [[TMP13]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: store <8 x i32> [[TMP8]], ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP10]], i32 8 +; CHECK-NEXT: store <8 x i32> [[TMP9]], ptr [[TMP13]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: @@ -479,21 +479,45 @@ exit: define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; CHECK-LABEL: @combine_load_factor2_i64( ; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC1]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP6]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 
1024, 1024 +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] ; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 4 ; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 4 ; CHECK-NEXT: [[RES:%.*]] = add i64 [[X0]], [[X1]] -; CHECK-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[I]] +; CHECK-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]] ; CHECK-NEXT: store i64 [[RES]], ptr [[DST]], align 4 ; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll b/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll index c456e0e1df7e8..9b3b90a7bc3b6 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S | FileCheck %s -check-prefix=LMUL1 ; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S --riscv-v-register-bit-width-lmul=1 | FileCheck %s -check-prefix=LMUL1 ; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S --riscv-v-register-bit-width-lmul=2 | FileCheck %s -check-prefix=LMUL2 ; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S --riscv-v-register-bit-width-lmul=4 | FileCheck %s -check-prefix=LMUL4 ; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S --riscv-v-register-bit-width-lmul=8 | FileCheck %s -check-prefix=LMUL8 +; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S | FileCheck %s -check-prefix=LMUL2 define void @load_store(ptr %p) { ; LMUL1-LABEL: @load_store( diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll index dc4522756c369..ace267d72dea0 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll @@ -9,9 +9,9 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 ; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 5, [[TMP4]] ; CHECK-NEXT: 
[[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] @@ -20,18 +20,18 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP5]], i64 5) +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP5]], i64 5) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8i8.p0(ptr [[TMP7]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP8:%.*]] = shl [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP7]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP8:%.*]] = shl [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP5]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv8i8.p0(ptr [[TMP10]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP8]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0( [[TMP11]], ptr [[TMP10]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP10]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP8]], [[WIDE_MASKED_LOAD1]] +; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP11]], ptr [[TMP10]], i32 1, [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8 +; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP13]] ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll index 3d580a5671783..34b06972dab06 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll @@ -12,44 +12,44 @@ define void @test(ptr noalias nocapture %a, ptr noalias nocapture %b, i32 %v) { ; VLENUNK-LABEL: @test( ; VLENUNK-NEXT: entry: ; VLENUNK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; VLENUNK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VLENUNK: vector.ph: ; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 ; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; VLENUNK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv2i64() -; VLENUNK-NEXT: [[TMP5:%.*]] = add [[TMP4]], 
zeroinitializer -; VLENUNK-NEXT: [[TMP6:%.*]] = mul [[TMP5]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; VLENUNK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP6]] +; VLENUNK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; VLENUNK-NEXT: [[TMP5:%.*]] = add [[TMP4]], zeroinitializer +; VLENUNK-NEXT: [[TMP6:%.*]] = mul [[TMP5]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; VLENUNK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP6]] ; VLENUNK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; VLENUNK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; VLENUNK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]] -; VLENUNK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 -; VLENUNK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[V:%.*]], i64 0 -; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; VLENUNK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; VLENUNK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[V:%.*]], i64 0 +; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]] ; VLENUNK: vector.body: ; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; VLENUNK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; VLENUNK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; VLENUNK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 -; VLENUNK-NEXT: [[TMP11:%.*]] = icmp ult [[VEC_IND]], shufflevector ( insertelement ( poison, i64 512, i64 0), poison, zeroinitializer) +; VLENUNK-NEXT: [[TMP11:%.*]] = icmp ult [[VEC_IND]], shufflevector ( insertelement ( poison, i64 512, i64 0), poison, zeroinitializer) ; VLENUNK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP10]] ; VLENUNK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0 -; VLENUNK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i32.p0(ptr [[TMP13]], i32 4, [[TMP11]], poison) -; VLENUNK-NEXT: [[TMP14:%.*]] = xor [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; VLENUNK-NEXT: [[PREDPHI:%.*]] = select [[TMP14]], zeroinitializer, [[WIDE_MASKED_LOAD]] -; VLENUNK-NEXT: [[TMP15:%.*]] = add [[PREDPHI]], [[BROADCAST_SPLAT]] +; VLENUNK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP13]], i32 4, [[TMP11]], poison) +; VLENUNK-NEXT: [[TMP14:%.*]] = xor [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; VLENUNK-NEXT: [[PREDPHI:%.*]] = select [[TMP14]], zeroinitializer, [[WIDE_MASKED_LOAD]] +; VLENUNK-NEXT: [[TMP15:%.*]] = add [[PREDPHI]], [[BROADCAST_SPLAT]] ; VLENUNK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP10]] ; VLENUNK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0 -; VLENUNK-NEXT: store [[TMP15]], ptr [[TMP17]], align 4 +; VLENUNK-NEXT: store [[TMP15]], ptr [[TMP17]], align 4 ; VLENUNK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() 
-; VLENUNK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2 +; VLENUNK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4 ; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] -; VLENUNK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; VLENUNK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; VLENUNK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VLENUNK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VLENUNK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll index ac56579af2d26..00cabd58de913 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll @@ -17,8 +17,9 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32-LABEL: @foo4( ; RV32-NEXT: entry: ; RV32-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP1:%.*]] = call i64 @llvm.umax.i64(i64 12, i64 [[TMP0]]) -; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 625, [[TMP1]] +; RV32-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; RV32-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]]) +; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 625, [[TMP2]] ; RV32-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; RV32: vector.memcheck: ; RV32-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 79880 @@ -33,37 +34,40 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] ; RV32-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; RV32: vector.ph: -; RV32-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 625, [[TMP2]] +; RV32-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 +; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 625, [[TMP4]] ; RV32-NEXT: [[N_VEC:%.*]] = sub i64 625, [[N_MOD_VF]] ; RV32-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 16 -; RV32-NEXT: [[TMP3:%.*]] = call @llvm.experimental.stepvector.nxv1i64() -; RV32-NEXT: [[TMP4:%.*]] = add [[TMP3]], zeroinitializer -; RV32-NEXT: [[TMP5:%.*]] = mul [[TMP4]], shufflevector ( insertelement ( poison, i64 16, i64 0), poison, zeroinitializer) -; RV32-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP5]] -; RV32-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP7:%.*]] = mul i64 16, [[TMP6]] -; RV32-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP7]], i64 0 -; RV32-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; RV32-NEXT: [[TMP5:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; RV32-NEXT: [[TMP6:%.*]] = add [[TMP5]], zeroinitializer +; RV32-NEXT: [[TMP7:%.*]] = mul [[TMP6]], shufflevector ( insertelement ( poison, i64 16, i64 0), poison, zeroinitializer) +; RV32-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] +; RV32-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; RV32-NEXT: [[TMP10:%.*]] = mul i64 16, [[TMP9]] +; RV32-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP10]], i64 0 +; RV32-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; RV32-NEXT: br label [[VECTOR_BODY:%.*]] ; RV32: vector.body: ; 
RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; RV32-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; RV32-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], [[VEC_IND]] -; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv1i32.nxv1p0( [[TMP8]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison), !alias.scope !0 -; RV32-NEXT: [[TMP9:%.*]] = icmp slt [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 100, i64 0), poison, zeroinitializer) -; RV32-NEXT: [[TMP10:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; RV32-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[B]], [[TMP10]] -; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.masked.gather.nxv1f64.nxv1p0( [[TMP11]], i32 8, [[TMP9]], poison), !alias.scope !3 -; RV32-NEXT: [[TMP12:%.*]] = sitofp [[WIDE_MASKED_GATHER]] to -; RV32-NEXT: [[TMP13:%.*]] = fadd [[WIDE_MASKED_GATHER6]], [[TMP12]] -; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[A]], [[VEC_IND]] -; RV32-NEXT: call void @llvm.masked.scatter.nxv1f64.nxv1p0( [[TMP13]], [[TMP14]], i32 8, [[TMP9]]), !alias.scope !5, !noalias !7 -; RV32-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] -; RV32-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; RV32-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV32-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; RV32-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; RV32-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], [[VEC_IND]] +; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i32.nxv2p0( [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison), !alias.scope !0 +; RV32-NEXT: [[TMP12:%.*]] = icmp slt [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 100, i64 0), poison, zeroinitializer) +; RV32-NEXT: [[TMP13:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[B]], [[TMP13]] +; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.masked.gather.nxv2f64.nxv2p0( [[TMP14]], i32 8, [[TMP12]], poison), !alias.scope !3 +; RV32-NEXT: [[TMP15:%.*]] = sitofp [[WIDE_MASKED_GATHER]] to +; RV32-NEXT: [[TMP16:%.*]] = fadd [[WIDE_MASKED_GATHER6]], [[TMP15]] +; RV32-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, ptr [[A]], [[VEC_IND]] +; RV32-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0( [[TMP16]], [[TMP17]], i32 8, [[TMP12]]), !alias.scope !5, !noalias !7 +; RV32-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2 +; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] +; RV32-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; RV32-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV32-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; RV32: middle.block: ; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 625, [[N_VEC]] ; RV32-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -73,30 +77,31 
@@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32: for.body: ; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; RV32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]] -; RV32-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; RV32-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP17]], 100 +; RV32-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; RV32-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP21]], 100 ; RV32-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; RV32: if.then: -; RV32-NEXT: [[TMP18:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 -; RV32-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP18]] -; RV32-NEXT: [[TMP19:%.*]] = load double, ptr [[ARRAYIDX3]], align 8 -; RV32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP17]] to double -; RV32-NEXT: [[ADD:%.*]] = fadd double [[TMP19]], [[CONV]] +; RV32-NEXT: [[TMP22:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 +; RV32-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP22]] +; RV32-NEXT: [[TMP23:%.*]] = load double, ptr [[ARRAYIDX3]], align 8 +; RV32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP21]] to double +; RV32-NEXT: [[ADD:%.*]] = fadd double [[TMP23]], [[CONV]] ; RV32-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]] ; RV32-NEXT: store double [[ADD]], ptr [[ARRAYIDX7]], align 8 ; RV32-NEXT: br label [[FOR_INC]] ; RV32: for.inc: ; RV32-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16 ; RV32-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000 -; RV32-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP10:![0-9]+]] +; RV32-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP11:![0-9]+]] ; RV32: for.end: ; RV32-NEXT: ret void ; ; RV64-LABEL: @foo4( ; RV64-NEXT: entry: ; RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP1:%.*]] = call i64 @llvm.umax.i64(i64 12, i64 [[TMP0]]) -; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 625, [[TMP1]] +; RV64-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; RV64-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]]) +; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 625, [[TMP2]] ; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; RV64: vector.memcheck: ; RV64-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 79880 @@ -111,37 +116,40 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV64-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] ; RV64-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; RV64: vector.ph: -; RV64-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 625, [[TMP2]] +; RV64-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 +; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 625, [[TMP4]] ; RV64-NEXT: [[N_VEC:%.*]] = sub i64 625, [[N_MOD_VF]] ; RV64-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 16 -; RV64-NEXT: [[TMP3:%.*]] = call @llvm.experimental.stepvector.nxv1i64() -; RV64-NEXT: [[TMP4:%.*]] = add [[TMP3]], zeroinitializer -; RV64-NEXT: [[TMP5:%.*]] = mul [[TMP4]], shufflevector ( insertelement ( poison, i64 16, i64 0), poison, zeroinitializer) -; RV64-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP5]] -; RV64-NEXT: 
[[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP7:%.*]] = mul i64 16, [[TMP6]] -; RV64-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP7]], i64 0 -; RV64-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; RV64-NEXT: [[TMP5:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; RV64-NEXT: [[TMP6:%.*]] = add [[TMP5]], zeroinitializer +; RV64-NEXT: [[TMP7:%.*]] = mul [[TMP6]], shufflevector ( insertelement ( poison, i64 16, i64 0), poison, zeroinitializer) +; RV64-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] +; RV64-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; RV64-NEXT: [[TMP10:%.*]] = mul i64 16, [[TMP9]] +; RV64-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP10]], i64 0 +; RV64-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; RV64-NEXT: br label [[VECTOR_BODY:%.*]] ; RV64: vector.body: ; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; RV64-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; RV64-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], [[VEC_IND]] -; RV64-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv1i32.nxv1p0( [[TMP8]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison), !alias.scope !0 -; RV64-NEXT: [[TMP9:%.*]] = icmp slt [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 100, i64 0), poison, zeroinitializer) -; RV64-NEXT: [[TMP10:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; RV64-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[B]], [[TMP10]] -; RV64-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.masked.gather.nxv1f64.nxv1p0( [[TMP11]], i32 8, [[TMP9]], poison), !alias.scope !3 -; RV64-NEXT: [[TMP12:%.*]] = sitofp [[WIDE_MASKED_GATHER]] to -; RV64-NEXT: [[TMP13:%.*]] = fadd [[WIDE_MASKED_GATHER6]], [[TMP12]] -; RV64-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[A]], [[VEC_IND]] -; RV64-NEXT: call void @llvm.masked.scatter.nxv1f64.nxv1p0( [[TMP13]], [[TMP14]], i32 8, [[TMP9]]), !alias.scope !5, !noalias !7 -; RV64-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] -; RV64-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; RV64-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; RV64-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; RV64-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], [[VEC_IND]] +; RV64-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i32.nxv2p0( [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison), !alias.scope !0 +; RV64-NEXT: [[TMP12:%.*]] = icmp slt [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 100, i64 0), poison, zeroinitializer) +; RV64-NEXT: [[TMP13:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; RV64-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[B]], [[TMP13]] +; RV64-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.masked.gather.nxv2f64.nxv2p0( 
[[TMP14]], i32 8, [[TMP12]], poison), !alias.scope !3 +; RV64-NEXT: [[TMP15:%.*]] = sitofp [[WIDE_MASKED_GATHER]] to +; RV64-NEXT: [[TMP16:%.*]] = fadd [[WIDE_MASKED_GATHER6]], [[TMP15]] +; RV64-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, ptr [[A]], [[VEC_IND]] +; RV64-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0( [[TMP16]], [[TMP17]], i32 8, [[TMP12]]), !alias.scope !5, !noalias !7 +; RV64-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2 +; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] +; RV64-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; RV64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV64-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; RV64: middle.block: ; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 625, [[N_VEC]] ; RV64-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -151,22 +159,22 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV64: for.body: ; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]] -; RV64-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; RV64-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP17]], 100 +; RV64-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; RV64-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP21]], 100 ; RV64-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; RV64: if.then: -; RV64-NEXT: [[TMP18:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 -; RV64-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP18]] -; RV64-NEXT: [[TMP19:%.*]] = load double, ptr [[ARRAYIDX3]], align 8 -; RV64-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP17]] to double -; RV64-NEXT: [[ADD:%.*]] = fadd double [[TMP19]], [[CONV]] +; RV64-NEXT: [[TMP22:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 +; RV64-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP22]] +; RV64-NEXT: [[TMP23:%.*]] = load double, ptr [[ARRAYIDX3]], align 8 +; RV64-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP21]] to double +; RV64-NEXT: [[ADD:%.*]] = fadd double [[TMP23]], [[CONV]] ; RV64-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]] ; RV64-NEXT: store double [[ADD]], ptr [[ARRAYIDX7]], align 8 ; RV64-NEXT: br label [[FOR_INC]] ; RV64: for.inc: ; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16 ; RV64-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000 -; RV64-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP10:![0-9]+]] +; RV64-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP11:![0-9]+]] ; RV64: for.end: ; RV64-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-interleaved.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-interleaved.ll index 5fc46c203167f..7f4eb387a1ece 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-interleaved.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-interleaved.ll @@ -5,8 +5,8 @@ ; CHECK-LABEL: foo ; CHECK: LV: IC is 2 -; CHECK: %{{.*}} = add <4 x i32> %{{.*}}, -; CHECK: %{{.*}} = add {{.*}}, 8 +; CHECK: %{{.*}} = add <8 x i32> %{{.*}}, +; CHECK: %{{.*}} = add {{.*}}, 16 ; Function Attrs: nofree norecurse nosync nounwind writeonly define dso_local void @foo(i32 signext %n, ptr 
nocapture %A) local_unnamed_addr #0 { diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll index 4e9ec86df7ca0..e8d5dc6211b70 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll @@ -14,10 +14,10 @@ define ptr @array_add(ptr noalias nocapture readonly %a, ptr noalias nocapture r ; LMUL1-NEXT: [[TMP0:%.*]] = add i32 [[SIZE]], -1 ; LMUL1-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 ; LMUL1-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; LMUL1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 +; LMUL1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8 ; LMUL1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; LMUL1: vector.ph: -; LMUL1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 +; LMUL1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8 ; LMUL1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] ; LMUL1-NEXT: br label [[VECTOR_BODY:%.*]] ; LMUL1: vector.body: @@ -25,15 +25,15 @@ define ptr @array_add(ptr noalias nocapture readonly %a, ptr noalias nocapture r ; LMUL1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 ; LMUL1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP3]] ; LMUL1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 -; LMUL1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4 +; LMUL1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4 ; LMUL1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP3]] ; LMUL1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 -; LMUL1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4 -; LMUL1-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] +; LMUL1-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4 +; LMUL1-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] ; LMUL1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP3]] ; LMUL1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0 -; LMUL1-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP10]], align 4 -; LMUL1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; LMUL1-NEXT: store <8 x i32> [[TMP8]], ptr [[TMP10]], align 4 +; LMUL1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; LMUL1-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; LMUL1-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; LMUL1: middle.block: @@ -54,7 +54,7 @@ define ptr @array_add(ptr noalias nocapture readonly %a, ptr noalias nocapture r ; LMUL1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; LMUL1-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; LMUL1-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[SIZE]] -; LMUL1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; LMUL1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; LMUL1: for.end.loopexit: ; LMUL1-NEXT: br label [[FOR_END]] ; LMUL1: for.end: @@ -108,7 +108,7 @@ define ptr @array_add(ptr noalias nocapture readonly %a, ptr noalias nocapture r ; LMUL2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; LMUL2-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; LMUL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[SIZE]] -; LMUL2-NEXT: 
br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; LMUL2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; LMUL2: for.end.loopexit: ; LMUL2-NEXT: br label [[FOR_END]] ; LMUL2: for.end: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll index bc4e60425ac4c..a266ae643c1af 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll @@ -11,27 +11,30 @@ define void @test(ptr %p) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 200, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 200, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 200, [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 32 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP2]], 200 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i32 0 -; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP7]], align 32 -; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 32 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP4]], 200 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0 +; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP9]], align 32 +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 200, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -47,7 +50,7 @@ define void @test(ptr %p) { ; CHECK-NEXT: store i64 [[V]], ptr [[A2]], align 32 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: 
[[CMP:%.*]] = icmp ne i64 [[IV]], 199 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -82,12 +85,12 @@ define void @test_may_clobber(ptr %p) { ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 32 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32 ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP0]], 100 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0 -; CHECK-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: store <4 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: @@ -132,27 +135,30 @@ define void @trivial_due_max_vscale(ptr %p) { ; CHECK-LABEL: @trivial_due_max_vscale( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 200, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 200, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 200, [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 32 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP2]], 8192 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i32 0 -; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP7]], align 32 -; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 32 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP4]], 8192 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 
0 +; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP9]], align 32 +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 200, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -195,27 +201,30 @@ define void @no_high_lmul_or_interleave(ptr %p) { ; CHECK-LABEL: @no_high_lmul_or_interleave( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 200, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 200, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 200, [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 32 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP2]], 1024 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i32 0 -; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP7]], align 32 -; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 32 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP4]], 1024 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0 +; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP9]], align 32 +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 200, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll index 
53e00ad0fee2b..f7bc4bd35f377 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll @@ -16,27 +16,30 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v, i64 %n) { ; VLENUNK-LABEL: @vector_add( ; VLENUNK-NEXT: entry: ; VLENUNK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; VLENUNK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VLENUNK: vector.ph: -; VLENUNK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]] ; VLENUNK: vector.body: ; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; VLENUNK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; VLENUNK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; VLENUNK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; VLENUNK-NEXT: [[TMP5:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; VLENUNK-NEXT: store [[TMP5]], ptr [[TMP4]], align 8 -; VLENUNK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; VLENUNK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VLENUNK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VLENUNK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; VLENUNK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; VLENUNK-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; VLENUNK-NEXT: store [[TMP7]], ptr [[TMP6]], align 8 +; VLENUNK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; VLENUNK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; VLENUNK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VLENUNK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VLENUNK: middle.block: ; VLENUNK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; VLENUNK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -58,27 +61,30 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v, i64 %n) { ; VLEN128-LABEL: @vector_add( ; VLEN128-NEXT: entry: ; VLEN128-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; VLEN128-NEXT: 
[[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; VLEN128-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VLEN128: vector.ph: -; VLEN128-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; VLEN128-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; VLEN128-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; VLEN128-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; VLEN128-NEXT: br label [[VECTOR_BODY:%.*]] ; VLEN128: vector.body: ; VLEN128-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; VLEN128-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; VLEN128-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; VLEN128-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; VLEN128-NEXT: [[TMP5:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; VLEN128-NEXT: store [[TMP5]], ptr [[TMP4]], align 8 -; VLEN128-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; VLEN128-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VLEN128-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VLEN128-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; VLEN128-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; VLEN128-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; VLEN128-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; VLEN128-NEXT: store [[TMP7]], ptr [[TMP6]], align 8 +; VLEN128-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; VLEN128-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; VLEN128-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VLEN128-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VLEN128: middle.block: ; VLEN128-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; VLEN128-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -121,27 +127,27 @@ define void @vector_add_i32(ptr noalias nocapture %a, i32 %v, i64 %n) { ; VLENUNK-LABEL: @vector_add_i32( ; VLENUNK-NEXT: entry: ; VLENUNK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; VLENUNK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VLENUNK: vector.ph: ; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 ; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; VLENUNK-NEXT: [[N_VEC:%.*]] = 
sub i64 1024, [[N_MOD_VF]] -; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[V:%.*]], i64 0 -; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[V:%.*]], i64 0 +; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]] ; VLENUNK: vector.body: ; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; VLENUNK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 ; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP4]] ; VLENUNK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 -; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 -; VLENUNK-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; VLENUNK-NEXT: store [[TMP7]], ptr [[TMP6]], align 4 +; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 +; VLENUNK-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; VLENUNK-NEXT: store [[TMP7]], ptr [[TMP6]], align 4 ; VLENUNK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; VLENUNK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4 ; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] ; VLENUNK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VLENUNK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -166,27 +172,27 @@ define void @vector_add_i32(ptr noalias nocapture %a, i32 %v, i64 %n) { ; VLEN128-LABEL: @vector_add_i32( ; VLEN128-NEXT: entry: ; VLEN128-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; VLEN128-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; VLEN128-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VLEN128: vector.ph: ; VLEN128-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; VLEN128-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 ; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; VLEN128-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[V:%.*]], i64 0 -; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[V:%.*]], i64 0 +; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; VLEN128-NEXT: br label [[VECTOR_BODY:%.*]] ; VLEN128: vector.body: ; VLEN128-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; VLEN128-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 ; VLEN128-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP4]] ; VLEN128-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 -; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 -; VLEN128-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; VLEN128-NEXT: store [[TMP7]], ptr [[TMP6]], align 4 +; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 +; VLEN128-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; VLEN128-NEXT: store [[TMP7]], ptr [[TMP6]], 
align 4 ; VLEN128-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; VLEN128-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4 ; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] ; VLEN128-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VLEN128-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -287,27 +293,30 @@ define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; VLENUNK-LABEL: @indexed_store( ; VLENUNK-NEXT: entry: ; VLENUNK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; VLENUNK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VLENUNK: vector.ph: -; VLENUNK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]] ; VLENUNK: vector.body: ; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; VLENUNK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; VLENUNK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP2]] -; VLENUNK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_LOAD]] -; VLENUNK-NEXT: call void @llvm.masked.scatter.nxv1i64.nxv1p0( [[BROADCAST_SPLAT]], [[TMP5]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; VLENUNK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; VLENUNK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VLENUNK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VLENUNK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP4]] +; VLENUNK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; VLENUNK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_LOAD]] +; VLENUNK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[BROADCAST_SPLAT]], [[TMP7]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; VLENUNK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; VLENUNK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; VLENUNK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; 
VLENUNK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; VLENUNK: middle.block: ; VLENUNK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; VLENUNK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -329,27 +338,30 @@ define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; VLEN128-LABEL: @indexed_store( ; VLEN128-NEXT: entry: ; VLEN128-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; VLEN128-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; VLEN128-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VLEN128: vector.ph: -; VLEN128-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; VLEN128-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; VLEN128-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; VLEN128-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; VLEN128-NEXT: br label [[VECTOR_BODY:%.*]] ; VLEN128: vector.body: ; VLEN128-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; VLEN128-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; VLEN128-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP2]] -; VLEN128-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; VLEN128-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_LOAD]] -; VLEN128-NEXT: call void @llvm.masked.scatter.nxv1i64.nxv1p0( [[BROADCAST_SPLAT]], [[TMP5]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; VLEN128-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; VLEN128-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VLEN128-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VLEN128-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; VLEN128-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP4]] +; VLEN128-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; VLEN128-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_LOAD]] +; VLEN128-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[BROADCAST_SPLAT]], [[TMP7]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; VLEN128-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; VLEN128-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; VLEN128-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VLEN128-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; VLEN128: middle.block: ; VLEN128-NEXT: 
[[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; VLEN128-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -389,34 +401,37 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 ; VLENUNK-LABEL: @indexed_load( ; VLENUNK-NEXT: entry: ; VLENUNK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; VLENUNK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VLENUNK: vector.ph: -; VLENUNK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]] ; VLENUNK: vector.body: ; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; VLENUNK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] -; VLENUNK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; VLENUNK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP2]] -; VLENUNK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_LOAD]] -; VLENUNK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv1i64.nxv1p0( [[TMP5]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; VLENUNK-NEXT: [[TMP6]] = add [[VEC_PHI]], [[WIDE_MASKED_GATHER]] -; VLENUNK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] -; VLENUNK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VLENUNK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; VLENUNK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; VLENUNK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP4]] +; VLENUNK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; VLENUNK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_LOAD]] +; VLENUNK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP7]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; VLENUNK-NEXT: [[TMP8]] = add [[VEC_PHI]], [[WIDE_MASKED_GATHER]] +; VLENUNK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; VLENUNK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2 +; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] +; VLENUNK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VLENUNK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; VLENUNK: middle.block: -; VLENUNK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64( [[TMP6]]) +; VLENUNK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[TMP8]]) ; VLENUNK-NEXT: 
[[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; VLENUNK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; VLENUNK: scalar.ph: ; VLENUNK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; VLENUNK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; VLENUNK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] ; VLENUNK-NEXT: br label [[FOR_BODY:%.*]] ; VLENUNK: for.body: ; VLENUNK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -430,40 +445,43 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 ; VLENUNK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; VLENUNK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; VLENUNK: for.end: -; VLENUNK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; VLENUNK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] ; VLENUNK-NEXT: ret i64 [[SUM_NEXT_LCSSA]] ; ; VLEN128-LABEL: @indexed_load( ; VLEN128-NEXT: entry: ; VLEN128-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; VLEN128-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; VLEN128-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VLEN128: vector.ph: -; VLEN128-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; VLEN128-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; VLEN128-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; VLEN128-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; VLEN128-NEXT: br label [[VECTOR_BODY:%.*]] ; VLEN128: vector.body: ; VLEN128-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; VLEN128-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] -; VLEN128-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; VLEN128-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP2]] -; VLEN128-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; VLEN128-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_LOAD]] -; VLEN128-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv1i64.nxv1p0( [[TMP5]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; VLEN128-NEXT: [[TMP6]] = add [[VEC_PHI]], [[WIDE_MASKED_GATHER]] -; VLEN128-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] -; VLEN128-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VLEN128-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; VLEN128-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; VLEN128-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; VLEN128-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP4]] +; VLEN128-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = 
load , ptr [[TMP6]], align 8 +; VLEN128-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_LOAD]] +; VLEN128-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP7]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; VLEN128-NEXT: [[TMP8]] = add [[VEC_PHI]], [[WIDE_MASKED_GATHER]] +; VLEN128-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; VLEN128-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2 +; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] +; VLEN128-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VLEN128-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; VLEN128: middle.block: -; VLEN128-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64( [[TMP6]]) +; VLEN128-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[TMP8]]) ; VLEN128-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; VLEN128-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; VLEN128: scalar.ph: ; VLEN128-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; VLEN128-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; VLEN128-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] ; VLEN128-NEXT: br label [[FOR_BODY:%.*]] ; VLEN128: for.body: ; VLEN128-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -477,7 +495,7 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 ; VLEN128-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; VLEN128-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; VLEN128: for.end: -; VLEN128-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; VLEN128-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] ; VLEN128-NEXT: ret i64 [[SUM_NEXT_LCSSA]] ; entry: @@ -503,25 +521,28 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) { ; VLENUNK-LABEL: @splat_int( ; VLENUNK-NEXT: entry: ; VLENUNK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; VLENUNK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VLENUNK: vector.ph: -; VLENUNK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]] ; VLENUNK: vector.body: ; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ 
[[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; VLENUNK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; VLENUNK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; VLENUNK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; VLENUNK-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 -; VLENUNK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; VLENUNK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VLENUNK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; VLENUNK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; VLENUNK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; VLENUNK-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8 +; VLENUNK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; VLENUNK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; VLENUNK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VLENUNK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; VLENUNK: middle.block: ; VLENUNK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; VLENUNK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -541,25 +562,28 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) { ; VLEN128-LABEL: @splat_int( ; VLEN128-NEXT: entry: ; VLEN128-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; VLEN128-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; VLEN128-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VLEN128: vector.ph: -; VLEN128-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; VLEN128-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; VLEN128-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; VLEN128-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; VLEN128-NEXT: br label [[VECTOR_BODY:%.*]] ; VLEN128: vector.body: ; VLEN128-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; VLEN128-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; VLEN128-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; VLEN128-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; VLEN128-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 -; VLEN128-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; VLEN128-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VLEN128-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; VLEN128-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; VLEN128-NEXT: [[TMP5:%.*]] 
= getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; VLEN128-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; VLEN128-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8 +; VLEN128-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; VLEN128-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; VLEN128-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VLEN128-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; VLEN128: middle.block: ; VLEN128-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; VLEN128-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -595,25 +619,28 @@ define void @splat_ptr(ptr noalias nocapture %a, ptr %v, i64 %n) { ; VLENUNK-LABEL: @splat_ptr( ; VLENUNK-NEXT: entry: ; VLENUNK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; VLENUNK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VLENUNK: vector.ph: -; VLENUNK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[V:%.*]], i64 0 -; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[V:%.*]], i64 0 +; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]] ; VLENUNK: vector.body: ; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; VLENUNK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; VLENUNK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; VLENUNK-NEXT: [[TMP4:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i32 0 -; VLENUNK-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 -; VLENUNK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; VLENUNK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VLENUNK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; VLENUNK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; VLENUNK-NEXT: [[TMP6:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i32 0 +; VLENUNK-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8 +; VLENUNK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; VLENUNK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; VLENUNK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VLENUNK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; VLENUNK: middle.block: ; VLENUNK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; VLENUNK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label 
[[SCALAR_PH]] @@ -633,25 +660,28 @@ define void @splat_ptr(ptr noalias nocapture %a, ptr %v, i64 %n) { ; VLEN128-LABEL: @splat_ptr( ; VLEN128-NEXT: entry: ; VLEN128-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; VLEN128-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; VLEN128-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VLEN128: vector.ph: -; VLEN128-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; VLEN128-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; VLEN128-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; VLEN128-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[V:%.*]], i64 0 -; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[V:%.*]], i64 0 +; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; VLEN128-NEXT: br label [[VECTOR_BODY:%.*]] ; VLEN128: vector.body: ; VLEN128-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; VLEN128-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; VLEN128-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; VLEN128-NEXT: [[TMP4:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i32 0 -; VLEN128-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 -; VLEN128-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; VLEN128-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VLEN128-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; VLEN128-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; VLEN128-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; VLEN128-NEXT: [[TMP6:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i32 0 +; VLEN128-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8 +; VLEN128-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; VLEN128-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; VLEN128-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VLEN128-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; VLEN128: middle.block: ; VLEN128-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; VLEN128-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll index 8ed7b6444ec6e..c553977a83626 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll @@ -227,17 +227,17 @@ for.end: } ; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. 
-; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2) +; CHECK-REMARK: vectorized loop (vectorization width: 16, interleaved count: 2) define bfloat @fadd_fast_bfloat(ptr noalias nocapture readonly %a, i64 %n) { ; CHECK-LABEL: @fadd_fast_bfloat ; CHECK: vector.body: -; CHECK: %[[LOAD1:.*]] = load <8 x bfloat> -; CHECK: %[[LOAD2:.*]] = load <8 x bfloat> -; CHECK: %[[FADD1:.*]] = fadd fast <8 x bfloat> %[[LOAD1]] -; CHECK: %[[FADD2:.*]] = fadd fast <8 x bfloat> %[[LOAD2]] +; CHECK: %[[LOAD1:.*]] = load <16 x bfloat> +; CHECK: %[[LOAD2:.*]] = load <16 x bfloat> +; CHECK: %[[FADD1:.*]] = fadd fast <16 x bfloat> %[[LOAD1]] +; CHECK: %[[FADD2:.*]] = fadd fast <16 x bfloat> %[[LOAD2]] ; CHECK: middle.block: -; CHECK: %[[RDX:.*]] = fadd fast <8 x bfloat> %[[FADD2]], %[[FADD1]] -; CHECK: call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR8000, <8 x bfloat> %[[RDX]]) +; CHECK: %[[RDX:.*]] = fadd fast <16 x bfloat> %[[FADD2]], %[[FADD1]] +; CHECK: call fast bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR8000, <16 x bfloat> %[[RDX]]) entry: br label %for.body @@ -328,17 +328,17 @@ for.end: ; MUL ; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. -; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2) +; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2) define i32 @mul(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) { ; CHECK-LABEL: @mul ; CHECK: vector.body: -; CHECK: %[[LOAD1:.*]] = load <4 x i32> -; CHECK: %[[LOAD2:.*]] = load <4 x i32> -; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD1]] -; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD2]] +; CHECK: %[[LOAD1:.*]] = load <8 x i32> +; CHECK: %[[LOAD2:.*]] = load <8 x i32> +; CHECK: %[[MUL1:.*]] = mul <8 x i32> %[[LOAD1]] +; CHECK: %[[MUL2:.*]] = mul <8 x i32> %[[LOAD2]] ; CHECK: middle.block: -; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]] -; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]]) +; CHECK: %[[RDX:.*]] = mul <8 x i32> %[[MUL2]], %[[MUL1]] +; CHECK: call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %[[RDX]]) entry: br label %for.body @@ -358,21 +358,21 @@ for.end: ; preds = %for.body, %entry ; Note: This test was added to ensure we always check the legality of reductions (and emit a warning if necessary) before checking for memory dependencies ; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. 
-; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2) +; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2) define i32 @memory_dependence(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i64 %n) { ; CHECK-LABEL: @memory_dependence ; CHECK: vector.body: -; CHECK: %[[LOAD1:.*]] = load <4 x i32> -; CHECK: %[[LOAD2:.*]] = load <4 x i32> -; CHECK: %[[LOAD3:.*]] = load <4 x i32> -; CHECK: %[[LOAD4:.*]] = load <4 x i32> -; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD3]], %[[LOAD1]] -; CHECK: %[[ADD2:.*]] = add nsw <4 x i32> %[[LOAD4]], %[[LOAD2]] -; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD3]] -; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD4]] +; CHECK: %[[LOAD1:.*]] = load <8 x i32> +; CHECK: %[[LOAD2:.*]] = load <8 x i32> +; CHECK: %[[LOAD3:.*]] = load <8 x i32> +; CHECK: %[[LOAD4:.*]] = load <8 x i32> +; CHECK: %[[ADD1:.*]] = add nsw <8 x i32> %[[LOAD3]], %[[LOAD1]] +; CHECK: %[[ADD2:.*]] = add nsw <8 x i32> %[[LOAD4]], %[[LOAD2]] +; CHECK: %[[MUL1:.*]] = mul <8 x i32> %[[LOAD3]] +; CHECK: %[[MUL2:.*]] = mul <8 x i32> %[[LOAD4]] ; CHECK: middle.block: -; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]] -; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]]) +; CHECK: %[[RDX:.*]] = mul <8 x i32> %[[MUL2]], %[[MUL1]] +; CHECK: call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %[[RDX]]) entry: br label %for.body @@ -396,19 +396,19 @@ for.end: ret i32 %mul } -; CHECK-REMARK: vectorized loop (vectorization width: vscale x 2, interleaved count: 2) +; CHECK-REMARK: vectorized loop (vectorization width: vscale x 4, interleaved count: 2) define float @fmuladd(ptr %a, ptr %b, i64 %n) { ; CHECK-LABEL: @fmuladd( ; CHECK: vector.body: -; CHECK: [[WIDE_LOAD:%.*]] = load -; CHECK: [[WIDE_LOAD2:%.*]] = load -; CHECK: [[WIDE_LOAD3:%.*]] = load -; CHECK: [[WIDE_LOAD4:%.*]] = load -; CHECK: [[MULADD1:%.*]] = call reassoc @llvm.fmuladd.nxv2f32( [[WIDE_LOAD]], [[WIDE_LOAD3]], -; CHECK: [[MULADD2:%.*]] = call reassoc @llvm.fmuladd.nxv2f32( [[WIDE_LOAD2]], [[WIDE_LOAD4]], +; CHECK: [[WIDE_LOAD:%.*]] = load +; CHECK: [[WIDE_LOAD2:%.*]] = load +; CHECK: [[WIDE_LOAD3:%.*]] = load +; CHECK: [[WIDE_LOAD4:%.*]] = load +; CHECK: [[MULADD1:%.*]] = call reassoc @llvm.fmuladd.nxv4f32( [[WIDE_LOAD]], [[WIDE_LOAD3]], +; CHECK: [[MULADD2:%.*]] = call reassoc @llvm.fmuladd.nxv4f32( [[WIDE_LOAD2]], [[WIDE_LOAD4]], ; CHECK: middle.block: -; CHECK: [[BIN_RDX:%.*]] = fadd reassoc [[MULADD2]], [[MULADD1]] -; CHECK: call reassoc float @llvm.vector.reduce.fadd.nxv2f32(float -0.000000e+00, [[BIN_RDX]]) +; CHECK: [[BIN_RDX:%.*]] = fadd reassoc [[MULADD2]], [[MULADD1]] +; CHECK: call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[BIN_RDX]]) ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll index 6d057f378d199..5e231da7e7b57 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll @@ -12,27 +12,30 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 
[[N_RND_UP]], [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP3]], i64 1024) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv1i64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP6:%.*]] = add [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: call void @llvm.masked.store.nxv1i64.p0( [[TMP6]], ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1024) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP8:%.*]] = add [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP8]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -75,27 +78,30 @@ define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; 
CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP3]], i64 1024) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv1i64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv1i64.nxv1p0( [[BROADCAST_SPLAT]], [[TMP6]], i32 8, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1024) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[BROADCAST_SPLAT]], [[TMP8]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -136,34 +142,37 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: 
[[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP3]], i64 1024) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv1i64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv1i64.nxv1p0( [[TMP6]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP7]] = add [[VEC_PHI]], [[WIDE_MASKED_GATHER]] -; CHECK-NEXT: [[TMP8:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP7]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1024) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP8]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP9]] = add [[VEC_PHI]], [[WIDE_MASKED_GATHER]] +; CHECK-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP9]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64( [[TMP8]]) +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[TMP10]]) ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 
[[TMP14]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -177,7 +186,7 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[SUM_NEXT_LCSSA]] ; entry: @@ -205,25 +214,28 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP3]], i64 1024) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; CHECK-NEXT: call void @llvm.masked.store.nxv1i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1024) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; 
CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -260,26 +272,29 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP3]], i64 1024) +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1024) ; CHECK-NEXT: store i64 [[V]], ptr [[B:%.*]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; CHECK-NEXT: call void @llvm.masked.store.nxv1i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll 
b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll index 0d6ef7c00def8..4c994772643ef 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll @@ -57,33 +57,22 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 2 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 2 -; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP3]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 4, [[TMP4]] -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i32(i32 [[TMP5]], i32 4) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i32.p0(ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP8:%.*]] = add nsw [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.store.nxv2i32.p0( [[TMP8]], ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], +; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP2]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 4, 4 +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll index 7b6e29388c759..0bdcf5b1efd01 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll +++ 
b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll @@ -13,26 +13,29 @@ define void @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i6 ; SCALABLE-LABEL: @uniform_load( ; SCALABLE-NEXT: entry: ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP3:%.*]] = load i64, ptr [[B:%.*]], align 8 -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8 -; SCALABLE-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; SCALABLE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; SCALABLE-NEXT: [[TMP5:%.*]] = load i64, ptr [[B:%.*]], align 8 +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP5]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP7]], align 8 +; SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -58,17 +61,17 @@ define void @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i6 ; FIXEDLEN: vector.body: ; FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXEDLEN-NEXT: [[TMP2:%.*]] = load i64, ptr [[B:%.*]], align 8 -; 
FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP2]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 -; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 2 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP6]], align 8 -; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 +; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP6]], align 8 +; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXEDLEN-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXEDLEN-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; FIXEDLEN: middle.block: @@ -93,26 +96,29 @@ define void @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i6 ; TF-SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-SCALABLE: vector.ph: ; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; TF-SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; TF-SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; TF-SCALABLE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-SCALABLE: vector.body: ; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP3]], i64 1024) -; TF-SCALABLE-NEXT: [[TMP4:%.*]] = load i64, ptr [[B:%.*]], align 8 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP4]], i64 0 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; TF-SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP3]] -; TF-SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 -; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv1i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP6]], i32 8, [[ACTIVE_LANE_MASK]]) -; TF-SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 
[[TMP7]] -; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1024) +; TF-SCALABLE-NEXT: [[TMP6:%.*]] = load i64, ptr [[B:%.*]], align 8 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP6]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]] +; TF-SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0 +; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP8]], i32 8, [[ACTIVE_LANE_MASK]]) +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2 +; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]] +; TF-SCALABLE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; TF-SCALABLE-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; TF-SCALABLE: middle.block: ; TF-SCALABLE-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; TF-SCALABLE: scalar.ph: @@ -138,12 +144,12 @@ define void @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i6 ; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = load i64, ptr [[B:%.*]], align 8 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP1]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP1]], i64 0 +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; TF-FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8 -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; TF-FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8 +; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; TF-FIXEDLEN-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; TF-FIXEDLEN-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; TF-FIXEDLEN: middle.block: @@ -183,26 +189,29 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap ; SCALABLE-LABEL: @uniform_load_outside_use( ; SCALABLE-NEXT: entry: ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 
@llvm.vscale.i64() -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP3:%.*]] = load i64, ptr [[B:%.*]], align 8 -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8 -; SCALABLE-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; SCALABLE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; SCALABLE-NEXT: [[TMP5:%.*]] = load i64, ptr [[B:%.*]], align 8 +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP5]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP7]], align 8 +; SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -218,7 +227,7 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap ; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; SCALABLE: for.end: -; SCALABLE-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V]], [[FOR_BODY]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] ; SCALABLE-NEXT: ret i64 [[V_LCSSA]] ; ; FIXEDLEN-LABEL: @uniform_load_outside_use( @@ -229,17 +238,17 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap ; FIXEDLEN: vector.body: ; FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXEDLEN-NEXT: [[TMP2:%.*]] = load i64, ptr [[B:%.*]], align 8 -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = 
insertelement <2 x i64> poison, i64 [[TMP2]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP2]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 -; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 2 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP6]], align 8 -; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 +; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP6]], align 8 +; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXEDLEN-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXEDLEN-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; FIXEDLEN: middle.block: @@ -284,12 +293,12 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap ; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = load i64, ptr [[B:%.*]], align 8 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP1]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP1]], i64 0 +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; TF-FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8 -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; TF-FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8 +; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; TF-FIXEDLEN-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; TF-FIXEDLEN-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; TF-FIXEDLEN: middle.block: @@ -331,39 +340,43 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; SCALABLE-LABEL: @conditional_uniform_load( ; SCALABLE-NEXT: entry: ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: 
[[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; SCALABLE-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv1i64() -; SCALABLE-NEXT: [[TMP3:%.*]] = add [[TMP2]], zeroinitializer -; SCALABLE-NEXT: [[TMP4:%.*]] = mul [[TMP3]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP4]] -; SCALABLE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP6:%.*]] = mul i64 1, [[TMP5]] -; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP6]], i64 0 -; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; SCALABLE-NEXT: [[TMP5:%.*]] = add [[TMP4]], zeroinitializer +; SCALABLE-NEXT: [[TMP6:%.*]] = mul [[TMP5]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP6]] +; SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]] +; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP8:%.*]] = icmp ugt [[VEC_IND]], shufflevector ( insertelement ( poison, i64 10, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv1i64.nxv1p0( [[BROADCAST_SPLAT]], i32 8, [[TMP8]], poison) -; SCALABLE-NEXT: [[TMP9:%.*]] = xor [[TMP8]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[PREDPHI:%.*]] = select [[TMP8]], [[WIDE_MASKED_GATHER]], zeroinitializer -; SCALABLE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0 -; SCALABLE-NEXT: store [[PREDPHI]], ptr [[TMP11]], align 8 -; SCALABLE-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] -; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ 
[[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 +; SCALABLE-NEXT: [[TMP11:%.*]] = icmp ugt [[VEC_IND]], shufflevector ( insertelement ( poison, i64 10, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[BROADCAST_SPLAT]], i32 8, [[TMP11]], poison) +; SCALABLE-NEXT: [[TMP12:%.*]] = xor [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[PREDPHI:%.*]] = select [[TMP11]], [[WIDE_MASKED_GATHER]], zeroinitializer +; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP10]] +; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0 +; SCALABLE-NEXT: store [[PREDPHI]], ptr [[TMP14]], align 8 +; SCALABLE-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] +; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -391,33 +404,33 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; FIXEDLEN-NEXT: entry: ; FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXEDLEN: vector.ph: -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x ptr> poison, ptr [[B:%.*]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT]], <2 x ptr> poison, <2 x i32> zeroinitializer -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x ptr> poison, ptr [[B]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT2]], <2 x ptr> poison, <2 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[B:%.*]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x ptr> poison, ptr [[B]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT2]], <4 x ptr> poison, <4 x i32> zeroinitializer ; FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXEDLEN: vector.body: ; FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXEDLEN-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], +; FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; FIXEDLEN-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], ; FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 -; FIXEDLEN-NEXT: [[TMP2:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], -; FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp ugt <2 x i64> [[STEP_ADD]], -; FIXEDLEN-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> [[BROADCAST_SPLAT]], i32 8, 
<2 x i1> [[TMP2]], <2 x i64> poison) -; FIXEDLEN-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> [[BROADCAST_SPLAT3]], i32 8, <2 x i1> [[TMP3]], <2 x i64> poison) -; FIXEDLEN-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP2]], -; FIXEDLEN-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP3]], -; FIXEDLEN-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i64> [[WIDE_MASKED_GATHER]], <2 x i64> zeroinitializer -; FIXEDLEN-NEXT: [[PREDPHI5:%.*]] = select <2 x i1> [[TMP3]], <2 x i64> [[WIDE_MASKED_GATHER4]], <2 x i64> zeroinitializer +; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; FIXEDLEN-NEXT: [[TMP2:%.*]] = icmp ugt <4 x i64> [[VEC_IND]], +; FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp ugt <4 x i64> [[STEP_ADD]], +; FIXEDLEN-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[BROADCAST_SPLAT]], i32 8, <4 x i1> [[TMP2]], <4 x i64> poison) +; FIXEDLEN-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[BROADCAST_SPLAT3]], i32 8, <4 x i1> [[TMP3]], <4 x i64> poison) +; FIXEDLEN-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP2]], +; FIXEDLEN-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP3]], +; FIXEDLEN-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> [[WIDE_MASKED_GATHER]], <4 x i64> zeroinitializer +; FIXEDLEN-NEXT: [[PREDPHI5:%.*]] = select <4 x i1> [[TMP3]], <4 x i64> [[WIDE_MASKED_GATHER4]], <4 x i64> zeroinitializer ; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXEDLEN-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXEDLEN-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 -; FIXEDLEN-NEXT: store <2 x i64> [[PREDPHI]], ptr [[TMP8]], align 8 -; FIXEDLEN-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 2 -; FIXEDLEN-NEXT: store <2 x i64> [[PREDPHI5]], ptr [[TMP9]], align 8 -; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], +; FIXEDLEN-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP8]], align 8 +; FIXEDLEN-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 4 +; FIXEDLEN-NEXT: store <4 x i64> [[PREDPHI5]], ptr [[TMP9]], align 8 +; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], ; FIXEDLEN-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXEDLEN-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; FIXEDLEN: middle.block: @@ -448,42 +461,46 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; TF-SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-SCALABLE: vector.ph: ; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; TF-SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; TF-SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; TF-SCALABLE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], 
[[N_MOD_VF]] -; TF-SCALABLE-NEXT: [[TMP3:%.*]] = call @llvm.experimental.stepvector.nxv1i64() -; TF-SCALABLE-NEXT: [[TMP4:%.*]] = add [[TMP3]], zeroinitializer -; TF-SCALABLE-NEXT: [[TMP5:%.*]] = mul [[TMP4]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP5]] -; TF-SCALABLE-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP7:%.*]] = mul i64 1, [[TMP6]] -; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP7]], i64 0 -; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; TF-SCALABLE-NEXT: [[TMP6:%.*]] = add [[TMP5]], zeroinitializer +; TF-SCALABLE-NEXT: [[TMP7:%.*]] = mul [[TMP6]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] +; TF-SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; TF-SCALABLE-NEXT: [[TMP10:%.*]] = mul i64 1, [[TMP9]] +; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP10]], i64 0 +; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-SCALABLE: vector.body: ; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 -; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP8]], i64 1024) -; TF-SCALABLE-NEXT: [[TMP9:%.*]] = icmp ugt [[VEC_IND]], shufflevector ( insertelement ( poison, i64 10, i64 0), poison, zeroinitializer) -; TF-SCALABLE-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP9]], zeroinitializer -; TF-SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv1i64.nxv1p0( [[BROADCAST_SPLAT]], i32 8, [[TMP10]], poison) -; TF-SCALABLE-NEXT: [[TMP11:%.*]] = xor [[TMP9]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TF-SCALABLE-NEXT: [[TMP12:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP11]], zeroinitializer -; TF-SCALABLE-NEXT: [[PREDPHI:%.*]] = select [[TMP10]], [[WIDE_MASKED_GATHER]], zeroinitializer -; TF-SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP8]] -; TF-SCALABLE-NEXT: [[TMP14:%.*]] = or [[TMP10]], [[TMP12]] -; TF-SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0 -; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv1i64.p0( [[PREDPHI]], ptr [[TMP15]], i32 8, [[TMP14]]) -; TF-SCALABLE-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP16]] -; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; TF-SCALABLE-NEXT: [[TMP17:%.*]] = icmp 
eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0 +; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP11]], i64 1024) +; TF-SCALABLE-NEXT: [[TMP12:%.*]] = icmp ugt [[VEC_IND]], shufflevector ( insertelement ( poison, i64 10, i64 0), poison, zeroinitializer) +; TF-SCALABLE-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP12]], zeroinitializer +; TF-SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[BROADCAST_SPLAT]], i32 8, [[TMP13]], poison) +; TF-SCALABLE-NEXT: [[TMP14:%.*]] = xor [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TF-SCALABLE-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP14]], zeroinitializer +; TF-SCALABLE-NEXT: [[PREDPHI:%.*]] = select [[TMP13]], [[WIDE_MASKED_GATHER]], zeroinitializer +; TF-SCALABLE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP11]] +; TF-SCALABLE-NEXT: [[TMP17:%.*]] = or [[TMP13]], [[TMP15]] +; TF-SCALABLE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[TMP16]], i32 0 +; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP18]], i32 8, [[TMP17]]) +; TF-SCALABLE-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2 +; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP20]] +; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; TF-SCALABLE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; TF-SCALABLE-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; TF-SCALABLE: middle.block: ; TF-SCALABLE-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; TF-SCALABLE: scalar.ph: @@ -510,22 +527,22 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; TF-FIXEDLEN-NEXT: entry: ; TF-FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-FIXEDLEN: vector.ph: -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x ptr> poison, ptr [[B:%.*]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT]], <2 x ptr> poison, <2 x i32> zeroinitializer +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[B:%.*]], i64 0 +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer ; TF-FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-FIXEDLEN: vector.body: ; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; TF-FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], -; TF-FIXEDLEN-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> [[BROADCAST_SPLAT]], i32 8, <2 x i1> [[TMP1]], <2 x i64> poison) -; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = xor 
<2 x i1> [[TMP1]], -; TF-FIXEDLEN-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i64> [[WIDE_MASKED_GATHER]], <2 x i64> zeroinitializer +; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = icmp ugt <4 x i64> [[VEC_IND]], +; TF-FIXEDLEN-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[BROADCAST_SPLAT]], i32 8, <4 x i1> [[TMP1]], <4 x i64> poison) +; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], +; TF-FIXEDLEN-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> [[WIDE_MASKED_GATHER]], <4 x i64> zeroinitializer ; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; TF-FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; TF-FIXEDLEN-NEXT: store <2 x i64> [[PREDPHI]], ptr [[TMP4]], align 8 -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; TF-FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], +; TF-FIXEDLEN-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP4]], align 8 +; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; TF-FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], ; TF-FIXEDLEN-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; TF-FIXEDLEN-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; TF-FIXEDLEN: middle.block: @@ -578,26 +595,29 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt ; SCALABLE-LABEL: @uniform_load_unaligned( ; SCALABLE-NEXT: entry: ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP3:%.*]] = load i64, ptr [[B:%.*]], align 1 -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8 -; SCALABLE-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; SCALABLE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; SCALABLE-NEXT: [[TMP5:%.*]] = load i64, ptr [[B:%.*]], align 1 +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP5]], i64 0 +; SCALABLE-NEXT: 
[[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP7]], align 8 +; SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -623,17 +643,17 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt ; FIXEDLEN: vector.body: ; FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXEDLEN-NEXT: [[TMP2:%.*]] = load i64, ptr [[B:%.*]], align 1 -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP2]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 -; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 2 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP6]], align 8 -; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 +; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP6]], align 8 +; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXEDLEN-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXEDLEN-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; FIXEDLEN: middle.block: @@ -658,26 +678,29 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt ; TF-SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-SCALABLE: vector.ph: ; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; TF-SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; TF-SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 
[[TMP2]], 2 +; TF-SCALABLE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-SCALABLE: vector.body: ; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP3]], i64 1024) -; TF-SCALABLE-NEXT: [[TMP4:%.*]] = load i64, ptr [[B:%.*]], align 1 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP4]], i64 0 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; TF-SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP3]] -; TF-SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 -; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv1i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP6]], i32 8, [[ACTIVE_LANE_MASK]]) -; TF-SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]] -; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1024) +; TF-SCALABLE-NEXT: [[TMP6:%.*]] = load i64, ptr [[B:%.*]], align 1 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP6]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]] +; TF-SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0 +; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP8]], i32 8, [[ACTIVE_LANE_MASK]]) +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2 +; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]] +; TF-SCALABLE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; TF-SCALABLE-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; TF-SCALABLE: middle.block: ; TF-SCALABLE-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; TF-SCALABLE: scalar.ph: @@ -703,12 +726,12 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt ; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = load i64, ptr [[B:%.*]], align 1 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP1]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP1]], i64 0 +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> 
[[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; TF-FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8 -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; TF-FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8 +; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; TF-FIXEDLEN-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; TF-FIXEDLEN-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; TF-FIXEDLEN: middle.block: @@ -748,26 +771,29 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; SCALABLE-LABEL: @uniform_store( ; SCALABLE-NEXT: entry: ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 ; SCALABLE-NEXT: store i64 [[V]], ptr [[B:%.*]], align 8 -; SCALABLE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; SCALABLE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 -; SCALABLE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8 +; SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; SCALABLE: 
middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -789,23 +815,23 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; FIXEDLEN-NEXT: entry: ; FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXEDLEN: vector.ph: -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXEDLEN: vector.body: ; FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXEDLEN-NEXT: store i64 [[V]], ptr [[B:%.*]], align 8 ; FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 -; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 -; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 +; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 +; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXEDLEN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXEDLEN-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; FIXEDLEN: middle.block: @@ -830,26 +856,29 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; TF-SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-SCALABLE: vector.ph: ; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; TF-SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; TF-SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; TF-SCALABLE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; 
TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-SCALABLE: vector.body: ; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP3]], i64 1024) +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1024) ; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[B:%.*]], align 8 -; TF-SCALABLE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP3]] -; TF-SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv1i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]]) -; TF-SCALABLE-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; TF-SCALABLE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; TF-SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]] +; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]]) +; TF-SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; TF-SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; TF-SCALABLE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; TF-SCALABLE: middle.block: ; TF-SCALABLE-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; TF-SCALABLE: scalar.ph: @@ -870,8 +899,8 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; TF-FIXEDLEN-NEXT: entry: ; TF-FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-FIXEDLEN: vector.ph: -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; TF-FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-FIXEDLEN: vector.body: ; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 
@@ -879,8 +908,8 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[B:%.*]], align 8 ; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 -; TF-FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 8 -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; TF-FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 8 +; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; TF-FIXEDLEN-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; TF-FIXEDLEN: middle.block: @@ -920,35 +949,40 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; SCALABLE-LABEL: @uniform_store_of_loop_varying( ; SCALABLE-NEXT: entry: ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv1i64() -; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[INDEX]], i64 0 -; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[TMP3:%.*]] = add zeroinitializer, [[TMP2]] -; SCALABLE-NEXT: [[TMP4:%.*]] = mul [[TMP3]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[TMP5:%.*]] = add [[DOTSPLAT]], [[TMP4]] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32() -; SCALABLE-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 -; SCALABLE-NEXT: [[TMP9:%.*]] = extractelement [[TMP5]], i32 [[TMP8]] -; SCALABLE-NEXT: store i64 [[TMP9]], ptr [[B:%.*]], align 8 -; SCALABLE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP6]] -; SCALABLE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0 -; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP11]], align 8 -; SCALABLE-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] -; SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 
[[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; SCALABLE-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[INDEX]], i64 0 +; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[TMP5:%.*]] = add zeroinitializer, [[TMP4]] +; SCALABLE-NEXT: [[TMP6:%.*]] = mul [[TMP5]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[TMP7:%.*]] = add [[DOTSPLAT]], [[TMP6]] +; SCALABLE-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 +; SCALABLE-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 1 +; SCALABLE-NEXT: [[TMP10:%.*]] = call i32 @llvm.vscale.i32() +; SCALABLE-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], 2 +; SCALABLE-NEXT: [[TMP12:%.*]] = sub i32 [[TMP11]], 1 +; SCALABLE-NEXT: [[TMP13:%.*]] = extractelement [[TMP7]], i32 [[TMP12]] +; SCALABLE-NEXT: store i64 [[TMP13]], ptr [[B:%.*]], align 8 +; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP8]] +; SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP14]], i32 0 +; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP15]], align 8 +; SCALABLE-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 2 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]] +; SCALABLE-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -970,33 +1004,31 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; FIXEDLEN-NEXT: entry: ; FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXEDLEN: vector.ph: -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x ptr> poison, ptr [[B:%.*]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT]], <2 x ptr> poison, <2 x i32> zeroinitializer -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x ptr> poison, ptr [[B]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT2]], <2 x ptr> poison, <2 x i32> zeroinitializer -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT4]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT6]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXEDLEN-NEXT: br label 
[[VECTOR_BODY:%.*]] ; FIXEDLEN: vector.body: ; FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXEDLEN-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], ; FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 -; FIXEDLEN-NEXT: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> [[VEC_IND]], <2 x ptr> [[BROADCAST_SPLAT]], i32 8, <2 x i1> ) -; FIXEDLEN-NEXT: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> [[STEP_ADD]], <2 x ptr> [[BROADCAST_SPLAT3]], i32 8, <2 x i1> ) -; FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] -; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] -; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT5]], ptr [[TMP4]], align 8 -; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT7]], ptr [[TMP5]], align 8 -; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], -; FIXEDLEN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXEDLEN-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; FIXEDLEN-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; FIXEDLEN-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; FIXEDLEN-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; FIXEDLEN-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; FIXEDLEN-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; FIXEDLEN-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; FIXEDLEN-NEXT: store i64 [[TMP7]], ptr [[B:%.*]], align 8 +; FIXEDLEN-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; FIXEDLEN-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; FIXEDLEN-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP10]], align 8 +; FIXEDLEN-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP11]], align 8 +; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; FIXEDLEN-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; FIXEDLEN-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; FIXEDLEN: middle.block: ; FIXEDLEN-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024 ; FIXEDLEN-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1019,38 +1051,42 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; TF-SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-SCALABLE: vector.ph: ; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; TF-SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; TF-SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; 
TF-SCALABLE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; TF-SCALABLE-NEXT: [[TMP3:%.*]] = call @llvm.experimental.stepvector.nxv1i64() -; TF-SCALABLE-NEXT: [[TMP4:%.*]] = add [[TMP3]], zeroinitializer -; TF-SCALABLE-NEXT: [[TMP5:%.*]] = mul [[TMP4]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP5]] -; TF-SCALABLE-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP7:%.*]] = mul i64 1, [[TMP6]] -; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP7]], i64 0 -; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; TF-SCALABLE-NEXT: [[TMP6:%.*]] = add [[TMP5]], zeroinitializer +; TF-SCALABLE-NEXT: [[TMP7:%.*]] = mul [[TMP6]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] +; TF-SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; TF-SCALABLE-NEXT: [[TMP10:%.*]] = mul i64 1, [[TMP9]] +; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP10]], i64 0 +; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-SCALABLE: vector.body: ; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 -; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP8]], i64 1024) -; TF-SCALABLE-NEXT: call void @llvm.masked.scatter.nxv1i64.nxv1p0( [[VEC_IND]], [[BROADCAST_SPLAT]], i32 8, [[ACTIVE_LANE_MASK]]) -; TF-SCALABLE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP8]] -; TF-SCALABLE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i32 0 -; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv1i64.p0( [[BROADCAST_SPLAT2]], ptr [[TMP10]], i32 8, [[ACTIVE_LANE_MASK]]) -; TF-SCALABLE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP11]] -; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add 
[[VEC_IND]], [[DOTSPLAT]] -; TF-SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0 +; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP11]], i64 1024) +; TF-SCALABLE-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[VEC_IND]], [[BROADCAST_SPLAT]], i32 8, [[ACTIVE_LANE_MASK]]) +; TF-SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP11]] +; TF-SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0 +; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT2]], ptr [[TMP13]], i32 8, [[ACTIVE_LANE_MASK]]) +; TF-SCALABLE-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2 +; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP15]] +; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; TF-SCALABLE-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; TF-SCALABLE-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; TF-SCALABLE: middle.block: ; TF-SCALABLE-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; TF-SCALABLE: scalar.ph: @@ -1071,23 +1107,22 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; TF-FIXEDLEN-NEXT: entry: ; TF-FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-FIXEDLEN: vector.ph: -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x ptr> poison, ptr [[B:%.*]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT]], <2 x ptr> poison, <2 x i32> zeroinitializer -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; TF-FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-FIXEDLEN: vector.body: ; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; TF-FIXEDLEN-NEXT: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> [[VEC_IND]], <2 x ptr> [[BROADCAST_SPLAT]], i32 8, <2 x i1> ) -; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] -; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 -; TF-FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP2]], align 8 -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; TF-FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], -; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; TF-FIXEDLEN-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; TF-FIXEDLEN-NEXT: store i64 [[TMP3]], ptr [[B:%.*]], align 8 +; TF-FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; TF-FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 +; TF-FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8 +; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; TF-FIXEDLEN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; TF-FIXEDLEN-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; TF-FIXEDLEN: middle.block: ; TF-FIXEDLEN-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024 ; TF-FIXEDLEN-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1125,39 +1160,43 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; SCALABLE-LABEL: @conditional_uniform_store( ; SCALABLE-NEXT: entry: ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; SCALABLE-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv1i64() -; SCALABLE-NEXT: [[TMP3:%.*]] = add [[TMP2]], zeroinitializer -; SCALABLE-NEXT: [[TMP4:%.*]] = mul [[TMP3]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP4]] -; SCALABLE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP6:%.*]] = mul i64 1, [[TMP5]] -; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP6]], i64 0 -; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; SCALABLE-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; SCALABLE-NEXT: [[TMP5:%.*]] = add [[TMP4]], zeroinitializer +; SCALABLE-NEXT: [[TMP6:%.*]] = mul [[TMP5]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP6]] +; SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]] +; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector 
[[DOTSPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP8:%.*]] = icmp ugt [[VEC_IND]], shufflevector ( insertelement ( poison, i64 10, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: call void @llvm.masked.scatter.nxv1i64.nxv1p0( [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]], i32 8, [[TMP8]]) -; SCALABLE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i32 0 -; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP10]], align 8 -; SCALABLE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] -; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 +; SCALABLE-NEXT: [[TMP11:%.*]] = icmp ugt [[VEC_IND]], shufflevector ( insertelement ( poison, i64 10, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]], i32 8, [[TMP11]]) +; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP10]] +; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0 +; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP13]], align 8 +; SCALABLE-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] +; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; SCALABLE-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1184,33 +1223,33 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; FIXEDLEN-NEXT: entry: ; FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXEDLEN: vector.ph: -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x ptr> poison, ptr [[B:%.*]], i64 0 -; FIXEDLEN-NEXT: 
[[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT2]], <2 x ptr> poison, <2 x i32> zeroinitializer -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT4]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <2 x ptr> poison, ptr [[B]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT6]], <2 x ptr> poison, <2 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x ptr> poison, ptr [[B:%.*]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT2]], <4 x ptr> poison, <4 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT4]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x ptr> poison, ptr [[B]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT6]], <4 x ptr> poison, <4 x i32> zeroinitializer ; FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXEDLEN: vector.body: ; FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXEDLEN-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], +; FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; FIXEDLEN-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], ; FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 -; FIXEDLEN-NEXT: [[TMP2:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], -; FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp ugt <2 x i64> [[STEP_ADD]], -; FIXEDLEN-NEXT: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> [[BROADCAST_SPLAT]], <2 x ptr> [[BROADCAST_SPLAT3]], i32 8, <2 x i1> [[TMP2]]) -; FIXEDLEN-NEXT: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> [[BROADCAST_SPLAT5]], <2 x ptr> [[BROADCAST_SPLAT7]], i32 8, <2 x i1> [[TMP3]]) +; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; FIXEDLEN-NEXT: [[TMP2:%.*]] = icmp ugt <4 x i64> [[VEC_IND]], +; FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp ugt <4 x i64> [[STEP_ADD]], +; FIXEDLEN-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[BROADCAST_SPLAT]], <4 x ptr> [[BROADCAST_SPLAT3]], i32 8, <4 x i1> [[TMP2]]) +; FIXEDLEN-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[BROADCAST_SPLAT5]], <4 x ptr> [[BROADCAST_SPLAT7]], i32 8, <4 x i1> [[TMP3]]) ; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8 -; FIXEDLEN-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 2 -; FIXEDLEN-NEXT: 
store <2 x i64> [[BROADCAST_SPLAT5]], ptr [[TMP7]], align 8 -; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8 +; FIXEDLEN-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT5]], ptr [[TMP7]], align 8 +; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], ; FIXEDLEN-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXEDLEN-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; FIXEDLEN: middle.block: @@ -1240,43 +1279,47 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; TF-SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-SCALABLE: vector.ph: ; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; TF-SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; TF-SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; TF-SCALABLE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; TF-SCALABLE-NEXT: [[TMP3:%.*]] = call @llvm.experimental.stepvector.nxv1i64() -; TF-SCALABLE-NEXT: [[TMP4:%.*]] = add [[TMP3]], zeroinitializer -; TF-SCALABLE-NEXT: [[TMP5:%.*]] = mul [[TMP4]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP5]] -; TF-SCALABLE-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP7:%.*]] = mul i64 1, [[TMP6]] -; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP7]], i64 0 -; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; TF-SCALABLE-NEXT: [[TMP6:%.*]] = add [[TMP5]], zeroinitializer +; TF-SCALABLE-NEXT: [[TMP7:%.*]] = mul [[TMP6]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] +; TF-SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; TF-SCALABLE-NEXT: [[TMP10:%.*]] = mul i64 1, [[TMP9]] +; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP10]], i64 0 +; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; TF-SCALABLE-NEXT: 
[[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-SCALABLE: vector.body: ; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 -; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP8]], i64 1024) -; TF-SCALABLE-NEXT: [[TMP9:%.*]] = icmp ugt [[VEC_IND]], shufflevector ( insertelement ( poison, i64 10, i64 0), poison, zeroinitializer) -; TF-SCALABLE-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP9]], zeroinitializer -; TF-SCALABLE-NEXT: call void @llvm.masked.scatter.nxv1i64.nxv1p0( [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]], i32 8, [[TMP10]]) -; TF-SCALABLE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP8]] -; TF-SCALABLE-NEXT: [[TMP12:%.*]] = xor [[TMP9]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TF-SCALABLE-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP12]], zeroinitializer -; TF-SCALABLE-NEXT: [[TMP14:%.*]] = or [[TMP10]], [[TMP13]] -; TF-SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP11]], i32 0 -; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv1i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP15]], i32 8, [[TMP14]]) -; TF-SCALABLE-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP16]] -; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; TF-SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0 +; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP11]], i64 1024) +; TF-SCALABLE-NEXT: [[TMP12:%.*]] = icmp ugt [[VEC_IND]], shufflevector ( insertelement ( poison, i64 10, i64 0), poison, zeroinitializer) +; TF-SCALABLE-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP12]], zeroinitializer +; TF-SCALABLE-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]], i32 8, [[TMP13]]) +; TF-SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP11]] +; TF-SCALABLE-NEXT: [[TMP15:%.*]] = xor [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TF-SCALABLE-NEXT: [[TMP16:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP15]], zeroinitializer +; TF-SCALABLE-NEXT: [[TMP17:%.*]] = or [[TMP13]], [[TMP16]] +; TF-SCALABLE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[TMP14]], i32 0 +; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP18]], i32 8, [[TMP17]]) +; TF-SCALABLE-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP20:%.*]] = mul 
i64 [[TMP19]], 2 +; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP20]] +; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; TF-SCALABLE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; TF-SCALABLE-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; TF-SCALABLE: middle.block: ; TF-SCALABLE-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; TF-SCALABLE: scalar.ph: @@ -1302,22 +1345,22 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; TF-FIXEDLEN-NEXT: entry: ; TF-FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-FIXEDLEN: vector.ph: -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x ptr> poison, ptr [[B:%.*]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT1]], <2 x ptr> poison, <2 x i32> zeroinitializer +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x ptr> poison, ptr [[B:%.*]], i64 0 +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT1]], <4 x ptr> poison, <4 x i32> zeroinitializer ; TF-FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-FIXEDLEN: vector.body: ; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; TF-FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], -; TF-FIXEDLEN-NEXT: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> [[BROADCAST_SPLAT]], <2 x ptr> [[BROADCAST_SPLAT2]], i32 8, <2 x i1> [[TMP1]]) +; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = icmp ugt <4 x i64> [[VEC_IND]], +; TF-FIXEDLEN-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[BROADCAST_SPLAT]], <4 x ptr> [[BROADCAST_SPLAT2]], i32 8, <4 x i1> [[TMP1]]) ; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; TF-FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8 -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; TF-FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], +; TF-FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8 +; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; TF-FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], ; TF-FIXEDLEN-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; TF-FIXEDLEN-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; TF-FIXEDLEN: middle.block: @@ -1368,26 +1411,29 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap ; 
SCALABLE-LABEL: @uniform_store_unaligned( ; SCALABLE-NEXT: entry: ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 ; SCALABLE-NEXT: store i64 [[V]], ptr [[B:%.*]], align 1 -; SCALABLE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; SCALABLE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 -; SCALABLE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8 +; SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1409,23 +1455,23 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap ; FIXEDLEN-NEXT: entry: ; FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXEDLEN: vector.ph: -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x 
i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXEDLEN: vector.body: ; FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXEDLEN-NEXT: store i64 [[V]], ptr [[B:%.*]], align 1 ; FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 -; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 -; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 +; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 +; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXEDLEN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXEDLEN-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; FIXEDLEN: middle.block: @@ -1450,26 +1496,29 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap ; TF-SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-SCALABLE: vector.ph: ; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; TF-SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; TF-SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; TF-SCALABLE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-SCALABLE: vector.body: ; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[TMP3:%.*]] = add 
i64 [[INDEX]], 0 -; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP3]], i64 1024) +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1024) ; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[B:%.*]], align 1 -; TF-SCALABLE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP3]] -; TF-SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv1i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]]) -; TF-SCALABLE-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; TF-SCALABLE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; TF-SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]] +; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]]) +; TF-SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; TF-SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; TF-SCALABLE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; TF-SCALABLE: middle.block: ; TF-SCALABLE-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; TF-SCALABLE: scalar.ph: @@ -1490,8 +1539,8 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap ; TF-FIXEDLEN-NEXT: entry: ; TF-FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-FIXEDLEN: vector.ph: -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; TF-FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-FIXEDLEN: vector.body: ; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -1499,8 +1548,8 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap ; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[B:%.*]], align 1 ; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 -; TF-FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 8 -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; TF-FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 8 +; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; TF-FIXEDLEN-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; TF-FIXEDLEN: middle.block: 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/zvl32b.ll b/llvm/test/Transforms/LoopVectorize/RISCV/zvl32b.ll index 0e2f916ac08b6..fb2167b7f5c33 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/zvl32b.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/zvl32b.ll @@ -12,33 +12,23 @@ define void @vector_add_i16(ptr noalias nocapture %a, i16 %v, i64 %n) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <2 x i16> poison, i16 [[V]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT4]], <2 x i16> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], <2 x i64> [[VEC_IND]] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A]], <2 x i64> [[STEP_ADD]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x ptr> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i16, ptr [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[TMP4]], i32 0 -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x i16>, ptr [[TMP3]], align 2 -; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x i16>, ptr [[TMP5]], align 2 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i16> [[WIDE_VEC]], <4 x i16> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x i16> [[WIDE_VEC2]], <4 x i16> poison, <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i16> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i16> [[STRIDED_VEC3]], [[BROADCAST_SPLAT5]] -; CHECK-NEXT: call void @llvm.masked.scatter.v2i16.v2p0(<2 x i16> [[TMP6]], <2 x ptr> [[TMP0]], i32 2, <2 x i1> ) -; CHECK-NEXT: call void @llvm.masked.scatter.v2i16.v2p0(<2 x i16> [[TMP7]], <2 x ptr> [[TMP1]], i32 2, <2 x i1> ) +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i16> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> [[TMP3]], <4 x ptr> [[TMP0]], i32 2, <4 x i1> ) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x 
i64> [[STEP_ADD]], -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020 -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020 +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -56,6 +46,7 @@ define void @vector_add_i16(ptr noalias nocapture %a, i16 %v, i64 %n) { ; CHECK: for.end: ; CHECK-NEXT: ret void ; + entry: br label %for.body diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/rvv-min-vector-size.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/rvv-min-vector-size.ll index cb017795077f1..08bac7f788c77 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/rvv-min-vector-size.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/rvv-min-vector-size.ll @@ -10,26 +10,10 @@ target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128" target triple = "riscv64" define void @foo(ptr nocapture writeonly %da) { -; CHECK-128-LABEL: @foo( -; CHECK-128-NEXT: entry: -; CHECK-128-NEXT: store i64 0, ptr [[DA:%.*]], align 8 -; CHECK-128-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[DA]], i64 1 -; CHECK-128-NEXT: store i64 0, ptr [[ARRAYIDX1]], align 8 -; CHECK-128-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[DA]], i64 2 -; CHECK-128-NEXT: store i64 0, ptr [[ARRAYIDX2]], align 8 -; CHECK-128-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, ptr [[DA]], i64 3 -; CHECK-128-NEXT: store i64 0, ptr [[ARRAYIDX3]], align 8 -; CHECK-128-NEXT: ret void -; -; CHECK-256-LABEL: @foo( -; CHECK-256-NEXT: entry: -; CHECK-256-NEXT: store <4 x i64> zeroinitializer, ptr [[DA:%.*]], align 8 -; CHECK-256-NEXT: ret void -; -; CHECK-512-LABEL: @foo( -; CHECK-512-NEXT: entry: -; CHECK-512-NEXT: store <4 x i64> zeroinitializer, ptr [[DA:%.*]], align 8 -; CHECK-512-NEXT: ret void +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr [[DA:%.*]], align 8 +; CHECK-NEXT: ret void ; entry: store i64 0, ptr %da, align 8 @@ -58,3 +42,7 @@ entry: %arrayidx2 = getelementptr inbounds i8, ptr %da, i8 2 ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK-128: {{.*}} +; CHECK-256: {{.*}} +; CHECK-512: {{.*}} From 48f97e5751372b4a64144605c5e9f7e5e13e382a Mon Sep 17 00:00:00 2001 From: Sam McCall Date: Fri, 24 Feb 2023 14:50:00 +0100 Subject: [PATCH 096/208] [FlowSensitive] Log analysis progress for debugging purposes The goal is to be able to understand how the analysis executes, and what its incremental and final findings are, by enabling logging and reading the logs. This should include both framework and analysis-specific information. Ad-hoc printf-debugging doesn't seem sufficient for my understanding, at least. Being able to check in logging, turn it on in a production binary, and quickly find particular analysis steps within complex functions seem important. This can be enabled programmatically through DataflowAnalysisOptions, or via the flag -dataflow-log. (Works in unittests, clang-tidy, standalone tools...) Important missing pieces here: - a logger implementation that produces an interactive report (HTML file) which can be navigated via timeline/code/CFG. (I think the Logger interface is sufficient for this, but need to prototype). 
- display of the application-specific lattice - more useful display for the built-in environment (e.g. meaningful & consistent names for values, hiding redundant variables in the flow condition, hiding unreachable expressions) Differential Revision: https://reviews.llvm.org/D144730 --- .../FlowSensitive/DataflowAnalysisContext.h | 16 +- .../FlowSensitive/DataflowEnvironment.h | 5 +- .../clang/Analysis/FlowSensitive/Logger.h | 85 ++++++++++ .../lib/Analysis/FlowSensitive/CMakeLists.txt | 1 + .../FlowSensitive/DataflowAnalysisContext.cpp | 28 ++++ clang/lib/Analysis/FlowSensitive/Logger.cpp | 108 +++++++++++++ .../TypeErasedDataflowAnalysis.cpp | 15 +- .../Analysis/FlowSensitive/CMakeLists.txt | 1 + .../Analysis/FlowSensitive/LoggerTest.cpp | 152 ++++++++++++++++++ 9 files changed, 403 insertions(+), 8 deletions(-) create mode 100644 clang/include/clang/Analysis/FlowSensitive/Logger.h create mode 100644 clang/lib/Analysis/FlowSensitive/Logger.cpp create mode 100644 clang/unittests/Analysis/FlowSensitive/LoggerTest.cpp diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h index 702aaff9c7e71..a044f477ce1b5 100644 --- a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h +++ b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h @@ -34,6 +34,7 @@ namespace clang { namespace dataflow { +class Logger; /// Skip past nodes that the CFG does not emit. These nodes are invisible to /// flow-sensitive analysis, and should be ignored as they will effectively not @@ -67,6 +68,11 @@ class DataflowAnalysisContext { /// fundamentally limited: some constructs, such as recursion, are /// explicitly unsupported. std::optional ContextSensitiveOpts; + + /// If provided, analysis details will be recorded here. + /// (This is always non-null within an AnalysisContext, the framework + /// provides a fallback no-op logger). + Logger *Log = nullptr; }; /// Constructs a dataflow analysis context. @@ -76,11 +82,9 @@ class DataflowAnalysisContext { /// `S` must not be null. DataflowAnalysisContext(std::unique_ptr S, Options Opts = Options{ - /*ContextSensitiveOpts=*/std::nullopt}) - : S(std::move(S)), TrueVal(createAtomicBoolValue()), - FalseVal(createAtomicBoolValue()), Opts(Opts) { - assert(this->S != nullptr); - } + /*ContextSensitiveOpts=*/std::nullopt, + /*Logger=*/nullptr}); + ~DataflowAnalysisContext(); /// Takes ownership of `Loc` and returns a reference to it. /// @@ -393,6 +397,8 @@ class DataflowAnalysisContext { // Fields modeled by environments covered by this context. llvm::DenseSet ModeledFields; + + std::unique_ptr LogOwner; // If created via flags. 
}; } // namespace dataflow diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h index e457430a5e646..678e5b871cc83 100644 --- a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h +++ b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h @@ -22,6 +22,7 @@ #include "clang/Analysis/FlowSensitive/ControlFlowContext.h" #include "clang/Analysis/FlowSensitive/DataflowAnalysisContext.h" #include "clang/Analysis/FlowSensitive/DataflowLattice.h" +#include "clang/Analysis/FlowSensitive/Logger.h" #include "clang/Analysis/FlowSensitive/StorageLocation.h" #include "clang/Analysis/FlowSensitive/Value.h" #include "llvm/ADT/DenseMap.h" @@ -177,10 +178,12 @@ class Environment { /// with a symbolic representation of the `this` pointee. Environment(DataflowAnalysisContext &DACtx, const DeclContext &DeclCtx); - const DataflowAnalysisContext::Options &getAnalysisOptions() { + const DataflowAnalysisContext::Options &getAnalysisOptions() const { return DACtx->getOptions(); } + Logger &logger() const { return *DACtx->getOptions().Log; } + /// Creates and returns an environment to use for an inline analysis of the /// callee. Uses the storage location from each argument in the `Call` as the /// storage location for the corresponding parameter in the callee. diff --git a/clang/include/clang/Analysis/FlowSensitive/Logger.h b/clang/include/clang/Analysis/FlowSensitive/Logger.h new file mode 100644 index 0000000000000..903dfbc30d40d --- /dev/null +++ b/clang/include/clang/Analysis/FlowSensitive/Logger.h @@ -0,0 +1,85 @@ +//===-- Logger.h ------------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_LOGGER_H +#define LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_LOGGER_H + +#include "clang/Analysis/CFG.h" +#include "llvm/Support/raw_ostream.h" +#include + +namespace clang::dataflow { +// Forward declarations so we can use Logger anywhere in the framework. +class ControlFlowContext; +class TypeErasedDataflowAnalysis; +struct TypeErasedDataflowAnalysisState; + +/// A logger is notified as the analysis progresses. +/// It can produce a report of the analysis's findings and how it came to them. +/// +/// The framework reports key structural events (e.g. traversal of blocks). +/// The specific analysis can add extra details to be presented in context. +class Logger { +public: + /// Returns a dummy logger that does nothing. + static Logger &null(); + /// A logger that simply writes messages to the specified ostream in real + /// time. + static std::unique_ptr textual(llvm::raw_ostream &); + + virtual ~Logger() = default; + + /// Called by the framework as we start analyzing a new function or statement. + /// Forms a pair with endAnalysis(). + virtual void beginAnalysis(const ControlFlowContext &, + TypeErasedDataflowAnalysis &) {} + virtual void endAnalysis() {} + + // At any time during the analysis, we're computing the state for some target + // program point. + + /// Called when we start (re-)processing a block in the CFG. + /// The target program point is the entry to the specified block. + /// Calls to log() describe transferBranch(), join() etc. 
+ virtual void enterBlock(const CFGBlock &) {} + /// Called when we start processing an element in the current CFG block. + /// The target program point is after the specified element. + /// Calls to log() describe the transfer() function. + virtual void enterElement(const CFGElement &) {} + + /// Records the analysis state computed for the current program point. + virtual void recordState(TypeErasedDataflowAnalysisState &) {} + /// Records that the analysis state for the current block is now final. + virtual void blockConverged() {} + + /// Called by the framework or user code to report some event. + /// The event is associated with the current context (program point). + /// The Emit function produces the log message. It may or may not be called, + /// depending on if the logger is interested; it should have no side effects. + void log(llvm::function_ref Emit) { + if (!ShouldLogText) + return; + std::string S; + llvm::raw_string_ostream OS(S); + Emit(OS); + logText(S); + } + +protected: + /// ShouldLogText should be false for trivial loggers that ignore logText(). + /// This allows log() to skip evaluating its Emit function. + Logger(bool ShouldLogText = true) : ShouldLogText(ShouldLogText) {} + +private: + bool ShouldLogText; + virtual void logText(llvm::StringRef) {} +}; + +} // namespace clang::dataflow + +#endif diff --git a/clang/lib/Analysis/FlowSensitive/CMakeLists.txt b/clang/lib/Analysis/FlowSensitive/CMakeLists.txt index 1a49998c39c20..a3216518f4dba 100644 --- a/clang/lib/Analysis/FlowSensitive/CMakeLists.txt +++ b/clang/lib/Analysis/FlowSensitive/CMakeLists.txt @@ -2,6 +2,7 @@ add_clang_library(clangAnalysisFlowSensitive ControlFlowContext.cpp DataflowAnalysisContext.cpp DataflowEnvironment.cpp + Logger.cpp Transfer.cpp TypeErasedDataflowAnalysis.cpp Value.cpp diff --git a/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp b/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp index a1b813982502b..57169baccbd4a 100644 --- a/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp +++ b/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp @@ -15,13 +15,20 @@ #include "clang/Analysis/FlowSensitive/DataflowAnalysisContext.h" #include "clang/AST/ExprCXX.h" #include "clang/Analysis/FlowSensitive/DebugSupport.h" +#include "clang/Analysis/FlowSensitive/Logger.h" #include "clang/Analysis/FlowSensitive/Value.h" #include "llvm/ADT/SetOperations.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include #include #include +static llvm::cl::opt + DataflowLog("dataflow-log", llvm::cl::Hidden, llvm::cl::ValueOptional, + llvm::cl::desc("Emit log of dataflow analysis. With no arg, " + "writes textual log to stderr.")); + namespace clang { namespace dataflow { @@ -375,6 +382,27 @@ DataflowAnalysisContext::getControlFlowContext(const FunctionDecl *F) { return nullptr; } +DataflowAnalysisContext::DataflowAnalysisContext(std::unique_ptr S, + Options Opts) + : S(std::move(S)), TrueVal(createAtomicBoolValue()), + FalseVal(createAtomicBoolValue()), Opts(Opts) { + assert(this->S != nullptr); + // If the -dataflow-log command-line flag was set, synthesize a logger. + // This is ugly but provides a uniform method for ad-hoc debugging dataflow- + // based tools. + if (Opts.Log == nullptr) { + if (DataflowLog.getNumOccurrences() > 0) { + LogOwner = Logger::textual(llvm::errs()); + this->Opts.Log = LogOwner.get(); + // FIXME: if the flag is given a value, write an HTML log to a file. 
+ } else { + this->Opts.Log = &Logger::null(); + } + } +} + +DataflowAnalysisContext::~DataflowAnalysisContext() = default; + } // namespace dataflow } // namespace clang diff --git a/clang/lib/Analysis/FlowSensitive/Logger.cpp b/clang/lib/Analysis/FlowSensitive/Logger.cpp new file mode 100644 index 0000000000000..469fea338e451 --- /dev/null +++ b/clang/lib/Analysis/FlowSensitive/Logger.cpp @@ -0,0 +1,108 @@ +//===-- Logger.cpp --------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Analysis/FlowSensitive/Logger.h" +#include "clang/Analysis/FlowSensitive/ControlFlowContext.h" +#include "clang/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.h" +#include "llvm/Support/WithColor.h" + +namespace clang::dataflow { + +Logger &Logger::null() { + struct NullLogger final : Logger {}; + static auto *Instance = new NullLogger(); + return *Instance; +} + +namespace { +struct TextualLogger final : Logger { + llvm::raw_ostream &OS; + const CFG *CurrentCFG; + const CFGBlock *CurrentBlock; + const CFGElement *CurrentElement; + unsigned CurrentElementIndex; + bool ShowColors; + llvm::DenseMap VisitCount; + TypeErasedDataflowAnalysis *CurrentAnalysis; + + TextualLogger(llvm::raw_ostream &OS) + : OS(OS), ShowColors(llvm::WithColor::defaultAutoDetectFunction()(OS)) {} + + virtual void beginAnalysis(const ControlFlowContext &CFG, + TypeErasedDataflowAnalysis &Analysis) override { + { + llvm::WithColor Header(OS, llvm::raw_ostream::Colors::RED, /*Bold=*/true); + OS << "=== Beginning data flow analysis ===\n"; + } + if (auto *D = CFG.getDecl()) { + D->print(OS); + OS << "\n"; + D->dump(OS); + } + CurrentCFG = &CFG.getCFG(); + CurrentCFG->print(OS, Analysis.getASTContext().getLangOpts(), ShowColors); + CurrentAnalysis = &Analysis; + } + virtual void endAnalysis() override { + llvm::WithColor Header(OS, llvm::raw_ostream::Colors::RED, /*Bold=*/true); + unsigned Blocks = 0, Steps = 0; + for (const auto &E : VisitCount) { + ++Blocks; + Steps += E.second; + } + llvm::errs() << "=== Finished analysis: " << Blocks << " blocks in " + << Steps << " total steps ===\n"; + } + virtual void enterBlock(const CFGBlock &Block) override { + unsigned Count = ++VisitCount[&Block]; + { + llvm::WithColor Header(OS, llvm::raw_ostream::Colors::RED, /*Bold=*/true); + OS << "=== Entering block B" << Block.getBlockID() << " (iteration " + << Count << ") ===\n"; + } + Block.print(OS, CurrentCFG, CurrentAnalysis->getASTContext().getLangOpts(), + ShowColors); + CurrentBlock = &Block; + CurrentElement = nullptr; + CurrentElementIndex = 0; + } + virtual void enterElement(const CFGElement &Element) override { + ++CurrentElementIndex; + CurrentElement = ∈ + { + llvm::WithColor Subheader(OS, llvm::raw_ostream::Colors::CYAN, + /*Bold=*/true); + OS << "Processing element B" << CurrentBlock->getBlockID() << "." + << CurrentElementIndex << ": "; + Element.dumpToStream(OS); + } + } + void recordState(TypeErasedDataflowAnalysisState &State) override { + { + llvm::WithColor Subheader(OS, llvm::raw_ostream::Colors::CYAN, + /*Bold=*/true); + OS << "Computed state for B" << CurrentBlock->getBlockID() << "." + << CurrentElementIndex << ":\n"; + } + // FIXME: currently the environment dump is verbose and unenlightening. 
+ // FIXME: dump the user-defined lattice, too. + State.Env.dump(OS); + OS << "\n"; + } + void blockConverged() override { + OS << "B" << CurrentBlock->getBlockID() << " has converged!\n"; + } + virtual void logText(llvm::StringRef S) override { OS << S << "\n"; } +}; +} // namespace + +std::unique_ptr Logger::textual(llvm::raw_ostream &OS) { + return std::make_unique(OS); +} + +} // namespace clang::dataflow diff --git a/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp b/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp index d94b547ca17de..08bcd5e65e379 100644 --- a/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp +++ b/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp @@ -191,7 +191,10 @@ struct AnalysisContext { llvm::ArrayRef> BlockStates) : CFCtx(CFCtx), Analysis(Analysis), InitEnv(InitEnv), - BlockStates(BlockStates) {} + Log(InitEnv.logger()), BlockStates(BlockStates) { + Log.beginAnalysis(CFCtx, Analysis); + } + ~AnalysisContext() { Log.endAnalysis(); } /// Contains the CFG being analyzed. const ControlFlowContext &CFCtx; @@ -199,6 +202,7 @@ struct AnalysisContext { TypeErasedDataflowAnalysis &Analysis; /// Initial state to start the analysis. const Environment &InitEnv; + Logger &Log; /// Stores the state of a CFG block if it has been evaluated by the analysis. /// The indices correspond to the block IDs. llvm::ArrayRef> BlockStates; @@ -368,8 +372,11 @@ transferCFGBlock(const CFGBlock &Block, AnalysisContext &AC, std::function PostVisitCFG = nullptr) { + AC.Log.enterBlock(Block); auto State = computeBlockInputState(Block, AC); + AC.Log.recordState(State); for (const auto &Element : Block) { + AC.Log.enterElement(Element); // Built-in analysis if (AC.Analysis.builtinOptions()) { builtinTransfer(Element, State, AC); @@ -382,6 +389,7 @@ transferCFGBlock(const CFGBlock &Block, AnalysisContext &AC, if (PostVisitCFG) { PostVisitCFG(Element, State); } + AC.Log.recordState(State); } return State; } @@ -462,15 +470,18 @@ runTypeErasedDataflowAnalysis( LatticeJoinEffect Effect2 = NewBlockState.Env.widen(OldBlockState->Env, Analysis); if (Effect1 == LatticeJoinEffect::Unchanged && - Effect2 == LatticeJoinEffect::Unchanged) + Effect2 == LatticeJoinEffect::Unchanged) { // The state of `Block` didn't change from widening so there's no need // to revisit its successors. + AC.Log.blockConverged(); continue; + } } else if (Analysis.isEqualTypeErased(OldBlockState->Lattice, NewBlockState.Lattice) && OldBlockState->Env.equivalentTo(NewBlockState.Env, Analysis)) { // The state of `Block` didn't change after transfer so there's no need // to revisit its successors. 
+ AC.Log.blockConverged(); continue; } } diff --git a/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt b/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt index ed38a515be270..c77aeaca90959 100644 --- a/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt +++ b/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt @@ -9,6 +9,7 @@ add_clang_unittest(ClangAnalysisFlowSensitiveTests DataflowAnalysisContextTest.cpp DataflowEnvironmentTest.cpp DebugSupportTest.cpp + LoggerTest.cpp MapLatticeTest.cpp MatchSwitchTest.cpp MultiVarConstantPropagationTest.cpp diff --git a/clang/unittests/Analysis/FlowSensitive/LoggerTest.cpp b/clang/unittests/Analysis/FlowSensitive/LoggerTest.cpp new file mode 100644 index 0000000000000..eab37045c393e --- /dev/null +++ b/clang/unittests/Analysis/FlowSensitive/LoggerTest.cpp @@ -0,0 +1,152 @@ +#include "TestingSupport.h" +#include "clang/ASTMatchers/ASTMatchers.h" +#include "clang/Analysis/FlowSensitive/DataflowAnalysis.h" +#include "clang/Analysis/FlowSensitive/DataflowEnvironment.h" +#include "clang/Analysis/FlowSensitive/DataflowLattice.h" +#include "llvm/Testing/Support/Error.h" +#include "gtest/gtest.h" +#include + +namespace clang::dataflow::test { +namespace { + +struct TestLattice { + int Elements = 0; + int Branches = 0; + int Joins = 0; + + LatticeJoinEffect join(const TestLattice &Other) { + if (Joins < 3) { + ++Joins; + Elements += Other.Elements; + Branches += Other.Branches; + return LatticeJoinEffect::Changed; + } + return LatticeJoinEffect::Unchanged; + } + friend bool operator==(const TestLattice &LHS, const TestLattice &RHS) { + return std::tie(LHS.Elements, LHS.Branches, LHS.Joins) == + std::tie(RHS.Elements, RHS.Branches, RHS.Joins); + } +}; + +class TestAnalysis : public DataflowAnalysis { +public: + using DataflowAnalysis::DataflowAnalysis; + + static TestLattice initialElement() { return TestLattice{}; } + void transfer(const CFGElement &, TestLattice &L, Environment &E) { + E.logger().log([](llvm::raw_ostream &OS) { OS << "transfer()"; }); + ++L.Elements; + } + void transferBranch(bool Branch, const Stmt *S, TestLattice &L, + Environment &E) { + E.logger().log([&](llvm::raw_ostream &OS) { + OS << "transferBranch(" << Branch << ")"; + }); + ++L.Branches; + } +}; + +class TestLogger : public Logger { +public: + TestLogger(std::string &S) : OS(S) {} + +private: + llvm::raw_string_ostream OS; + + void beginAnalysis(const ControlFlowContext &, + TypeErasedDataflowAnalysis &) override { + logText("beginAnalysis()"); + } + void endAnalysis() override { logText("\nendAnalysis()"); } + + void enterBlock(const CFGBlock &B) override { + OS << "\nenterBlock(" << B.BlockID << ")\n"; + } + void enterElement(const CFGElement &E) override { + // we don't want the trailing \n + std::string S; + llvm::raw_string_ostream SS(S); + E.dumpToStream(SS); + + OS << "enterElement(" << llvm::StringRef(S).trim() << ")\n"; + } + void recordState(TypeErasedDataflowAnalysisState &S) override { + const TestLattice &L = llvm::any_cast(S.Lattice.Value); + OS << "recordState(Elements=" << L.Elements << ", Branches=" << L.Branches + << ", Joins=" << L.Joins << ")\n"; + } + /// Records that the analysis state for the current block is now final. + void blockConverged() override { logText("blockConverged()"); } + + void logText(llvm::StringRef Text) override { OS << Text << "\n"; } +}; + +TEST(LoggerTest, Sequence) { + const char *Code = R"cpp( +int target(bool b, int p, int q) { + return b ? 
p : q; +} +)cpp"; + + auto Inputs = AnalysisInputs( + Code, ast_matchers::hasName("target"), + [](ASTContext &C, Environment &) { return TestAnalysis(C); }); + std::vector Args = { + "-fsyntax-only", "-fno-delayed-template-parsing", "-std=c++17"}; + Inputs.ASTBuildArgs = Args; + std::string Log; + TestLogger Logger(Log); + Inputs.BuiltinOptions.Log = &Logger; + + ASSERT_THAT_ERROR(checkDataflow(std::move(Inputs), + [](const AnalysisOutputs &) {}), + llvm::Succeeded()); + + EXPECT_EQ(Log, R"(beginAnalysis() + +enterBlock(4) +recordState(Elements=0, Branches=0, Joins=0) +enterElement(b) +transfer() +recordState(Elements=1, Branches=0, Joins=0) +enterElement(b (ImplicitCastExpr, LValueToRValue, _Bool)) +transfer() +recordState(Elements=2, Branches=0, Joins=0) + +enterBlock(3) +transferBranch(0) +recordState(Elements=2, Branches=1, Joins=0) +enterElement(q) +transfer() +recordState(Elements=3, Branches=1, Joins=0) + +enterBlock(2) +transferBranch(1) +recordState(Elements=2, Branches=1, Joins=0) +enterElement(p) +transfer() +recordState(Elements=3, Branches=1, Joins=0) + +enterBlock(1) +recordState(Elements=6, Branches=2, Joins=1) +enterElement(b ? p : q) +transfer() +recordState(Elements=7, Branches=2, Joins=1) +enterElement(b ? p : q (ImplicitCastExpr, LValueToRValue, int)) +transfer() +recordState(Elements=8, Branches=2, Joins=1) +enterElement(return b ? p : q;) +transfer() +recordState(Elements=9, Branches=2, Joins=1) + +enterBlock(0) +recordState(Elements=9, Branches=2, Joins=1) + +endAnalysis() +)"); +} + +} // namespace +} // namespace clang::dataflow::test From 002c4b7b955b1fc8825b4d6b46bb079390bce812 Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Fri, 17 Mar 2023 10:33:07 +0100 Subject: [PATCH 097/208] [clangd] Extend CollectMainFileMacros. Extend the existing MainFileMacros structure: - record more information (InConditionalDirective) in MacroOccurrence - collect macro references inside macro body (fix a long-time FIXME) So that the MainFileMacros preseve enough information, which allows a just-in-time convertion to interop with include-cleaner::Macro for include-cleaer features. See the context in https://reviews.llvm.org/D146017. 
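For illustration only -- this snippet is not part of the change and the macro
names are made up -- these are the kinds of occurrences the extended collector
now records, restating what the updated tests below exercise:

  #define BAR 1       // BAR: IsDefinition
  #define FOO BAR     // FOO: IsDefinition; BAR: reference inside a macro
                      //      body, now collected (previously a FIXME)
  #ifdef FOO          // FOO: InConditionalDirective
  int x = FOO;        // FOO: plain expansion, neither flag set
  #endif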
Differential Revision: https://reviews.llvm.org/D146279 --- clang-tools-extra/clangd/CollectMacros.cpp | 66 +++++++++++++- clang-tools-extra/clangd/CollectMacros.h | 50 ++++------- clang-tools-extra/clangd/ParsedAST.cpp | 15 ++-- clang-tools-extra/clangd/Preamble.cpp | 10 ++- .../clangd/unittests/CollectMacrosTests.cpp | 85 ++++++++++++------- .../unittests/SemanticHighlightingTests.cpp | 4 +- 6 files changed, 149 insertions(+), 81 deletions(-) diff --git a/clang-tools-extra/clangd/CollectMacros.cpp b/clang-tools-extra/clangd/CollectMacros.cpp index 687f86e0a77eb..c0ed8b68ea481 100644 --- a/clang-tools-extra/clangd/CollectMacros.cpp +++ b/clang-tools-extra/clangd/CollectMacros.cpp @@ -9,12 +9,13 @@ #include "CollectMacros.h" #include "AST.h" #include "clang/Basic/SourceLocation.h" +#include "llvm/ADT/STLExtras.h" namespace clang { namespace clangd { void CollectMainFileMacros::add(const Token &MacroNameTok, const MacroInfo *MI, - bool IsDefinition) { + bool IsDefinition, bool InIfCondition) { if (!InMainFile) return; auto Loc = MacroNameTok.getLocation(); @@ -26,9 +27,49 @@ void CollectMainFileMacros::add(const Token &MacroNameTok, const MacroInfo *MI, auto Range = halfOpenToRange( SM, CharSourceRange::getCharRange(Loc, MacroNameTok.getEndLoc())); if (auto SID = getSymbolID(Name, MI, SM)) - Out.MacroRefs[SID].push_back({Range, IsDefinition}); + Out.MacroRefs[SID].push_back({Range, IsDefinition, InIfCondition}); else - Out.UnknownMacros.push_back({Range, IsDefinition}); + Out.UnknownMacros.push_back({Range, IsDefinition, InIfCondition}); +} + +void CollectMainFileMacros::FileChanged(SourceLocation Loc, FileChangeReason, + SrcMgr::CharacteristicKind, FileID) { + InMainFile = isInsideMainFile(Loc, SM); +} +void CollectMainFileMacros::MacroExpands(const Token &MacroName, + const MacroDefinition &MD, + SourceRange Range, + const MacroArgs *Args) { + add(MacroName, MD.getMacroInfo()); +} +void CollectMainFileMacros::MacroUndefined(const clang::Token &MacroName, + const clang::MacroDefinition &MD, + const clang::MacroDirective *Undef) { + add(MacroName, MD.getMacroInfo()); +} +void CollectMainFileMacros::Ifdef(SourceLocation Loc, const Token &MacroName, + const MacroDefinition &MD) { + add(MacroName, MD.getMacroInfo(), /*IsDefinition=*/false, + /*InConditionalDirective=*/true); +} +void CollectMainFileMacros::Ifndef(SourceLocation Loc, const Token &MacroName, + const MacroDefinition &MD) { + add(MacroName, MD.getMacroInfo(), /*IsDefinition=*/false, + /*InConditionalDirective=*/true); +} +void CollectMainFileMacros::Defined(const Token &MacroName, + const MacroDefinition &MD, + SourceRange Range) { + add(MacroName, MD.getMacroInfo(), /*IsDefinition=*/false, + /*InConditionalDirective=*/true); +} +void CollectMainFileMacros::SourceRangeSkipped(SourceRange R, + SourceLocation EndifLoc) { + if (!InMainFile) + return; + Position Begin = sourceLocToPosition(SM, R.getBegin()); + Position End = sourceLocToPosition(SM, R.getEnd()); + Out.SkippedRanges.push_back(Range{Begin, End}); } class CollectPragmaMarks : public PPCallbacks { @@ -58,5 +99,24 @@ collectPragmaMarksCallback(const SourceManager &SM, return std::make_unique(SM, Out); } +void CollectMainFileMacros::MacroDefined(const Token &MacroName, + const MacroDirective *MD) { + + if (!InMainFile) + return; + const auto *MI = MD->getMacroInfo(); + add(MacroName, MD->getMacroInfo(), true); + if (MI) + for (const auto &Tok : MI->tokens()) { + auto *II = Tok.getIdentifierInfo(); + // Could this token be a reference to a macro? (Not param to this macro). 
+ if (!II || !II->hadMacroDefinition() || + llvm::is_contained(MI->params(), II)) + continue; + if (const MacroInfo *MI = PP.getMacroInfo(II)) + add(Tok, MI); + } +} + } // namespace clangd } // namespace clang diff --git a/clang-tools-extra/clangd/CollectMacros.h b/clang-tools-extra/clangd/CollectMacros.h index 9d7b478f1c3c7..d5789a2a88912 100644 --- a/clang-tools-extra/clangd/CollectMacros.h +++ b/clang-tools-extra/clangd/CollectMacros.h @@ -13,6 +13,7 @@ #include "SourceCode.h" #include "index/SymbolID.h" #include "clang/Lex/PPCallbacks.h" +#include "clang/Lex/Preprocessor.h" #include "llvm/ADT/DenseMap.h" #include @@ -24,6 +25,8 @@ struct MacroOccurrence { // SourceManager from preamble is not available when we build the AST. Range Rng; bool IsDefinition; + // True if the occurence is used in a conditional directive, e.g. #ifdef MACRO + bool InConditionalDirective; }; struct MainFileMacros { @@ -43,56 +46,37 @@ struct MainFileMacros { /// - collect macros after the preamble of the main file (in ParsedAST.cpp) class CollectMainFileMacros : public PPCallbacks { public: - explicit CollectMainFileMacros(const SourceManager &SM, MainFileMacros &Out) - : SM(SM), Out(Out) {} + explicit CollectMainFileMacros(const Preprocessor &PP, MainFileMacros &Out) + : SM(PP.getSourceManager()), PP(PP), Out(Out) {} void FileChanged(SourceLocation Loc, FileChangeReason, - SrcMgr::CharacteristicKind, FileID) override { - InMainFile = isInsideMainFile(Loc, SM); - } + SrcMgr::CharacteristicKind, FileID) override; - void MacroDefined(const Token &MacroName, const MacroDirective *MD) override { - add(MacroName, MD->getMacroInfo(), /*IsDefinition=*/true); - } + void MacroDefined(const Token &MacroName, const MacroDirective *MD) override; void MacroExpands(const Token &MacroName, const MacroDefinition &MD, - SourceRange Range, const MacroArgs *Args) override { - add(MacroName, MD.getMacroInfo()); - } + SourceRange Range, const MacroArgs *Args) override; void MacroUndefined(const clang::Token &MacroName, const clang::MacroDefinition &MD, - const clang::MacroDirective *Undef) override { - add(MacroName, MD.getMacroInfo()); - } + const clang::MacroDirective *Undef) override; + // FIXME: handle C++23 #elifdef, #elifndef void Ifdef(SourceLocation Loc, const Token &MacroName, - const MacroDefinition &MD) override { - add(MacroName, MD.getMacroInfo()); - } - + const MacroDefinition &MD) override; void Ifndef(SourceLocation Loc, const Token &MacroName, - const MacroDefinition &MD) override { - add(MacroName, MD.getMacroInfo()); - } + const MacroDefinition &MD) override; void Defined(const Token &MacroName, const MacroDefinition &MD, - SourceRange Range) override { - add(MacroName, MD.getMacroInfo()); - } - - void SourceRangeSkipped(SourceRange R, SourceLocation EndifLoc) override { - if (!InMainFile) - return; - Position Begin = sourceLocToPosition(SM, R.getBegin()); - Position End = sourceLocToPosition(SM, R.getEnd()); - Out.SkippedRanges.push_back(Range{Begin, End}); - } + SourceRange Range) override; + + void SourceRangeSkipped(SourceRange R, SourceLocation EndifLoc) override; private: void add(const Token &MacroNameTok, const MacroInfo *MI, - bool IsDefinition = false); + bool IsDefinition = false, bool InConditionalDirective = false); const SourceManager &SM; + const Preprocessor &PP; bool InMainFile = true; MainFileMacros &Out; }; diff --git a/clang-tools-extra/clangd/ParsedAST.cpp b/clang-tools-extra/clangd/ParsedAST.cpp index 1671eec133b6e..1501a5c5f3c3b 100644 --- a/clang-tools-extra/clangd/ParsedAST.cpp +++ 
b/clang-tools-extra/clangd/ParsedAST.cpp @@ -610,11 +610,12 @@ ParsedAST::build(llvm::StringRef Filename, const ParseInputs &Inputs, Macros = Patch->mainFileMacros(); Marks = Patch->marks(); } - Clang->getPreprocessor().addPPCallbacks( - std::make_unique(Clang->getSourceManager(), - Macros)); + auto& PP = Clang->getPreprocessor(); + PP.addPPCallbacks( + std::make_unique( + PP, Macros)); - Clang->getPreprocessor().addPPCallbacks( + PP.addPPCallbacks( collectPragmaMarksCallback(Clang->getSourceManager(), Marks)); // Copy over the includes from the preamble, then combine with the @@ -626,10 +627,10 @@ ParsedAST::build(llvm::StringRef Filename, const ParseInputs &Inputs, CanonIncludes.addSystemHeadersMapping(Clang->getLangOpts()); std::unique_ptr IWYUHandler = collectIWYUHeaderMaps(&CanonIncludes); - Clang->getPreprocessor().addCommentHandler(IWYUHandler.get()); + PP.addCommentHandler(IWYUHandler.get()); // Collect tokens of the main file. - syntax::TokenCollector CollectTokens(Clang->getPreprocessor()); + syntax::TokenCollector CollectTokens(PP); // To remain consistent with preamble builds, these callbacks must be called // exactly here, after preprocessor is initialized and BeginSourceFile() was @@ -660,7 +661,7 @@ ParsedAST::build(llvm::StringRef Filename, const ParseInputs &Inputs, // XXX: This is messy: clang-tidy checks flush some diagnostics at EOF. // However Action->EndSourceFile() would destroy the ASTContext! // So just inform the preprocessor of EOF, while keeping everything alive. - Clang->getPreprocessor().EndSourceFile(); + PP.EndSourceFile(); // UnitDiagsConsumer is local, we can not store it in CompilerInstance that // has a longer lifetime. Clang->getDiagnostics().setClient(new IgnoreDiagnostics); diff --git a/clang-tools-extra/clangd/Preamble.cpp b/clang-tools-extra/clangd/Preamble.cpp index 3b0af0ab50a62..061c67d65f7d8 100644 --- a/clang-tools-extra/clangd/Preamble.cpp +++ b/clang-tools-extra/clangd/Preamble.cpp @@ -133,6 +133,7 @@ class CppFilePreambleCallbacks : public PreambleCallbacks { CanonIncludes.addSystemHeadersMapping(CI.getLangOpts()); LangOpts = &CI.getLangOpts(); SourceMgr = &CI.getSourceManager(); + PP = &CI.getPreprocessor(); Includes.collect(CI); if (Config::current().Diagnostics.UnusedIncludes == Config::IncludesPolicy::Strict || @@ -144,11 +145,11 @@ class CppFilePreambleCallbacks : public PreambleCallbacks { } std::unique_ptr createPPCallbacks() override { - assert(SourceMgr && LangOpts && - "SourceMgr and LangOpts must be set at this point"); + assert(SourceMgr && LangOpts && PP && + "SourceMgr, LangOpts and PP must be set at this point"); return std::make_unique( - std::make_unique(*SourceMgr, Macros), + std::make_unique(*PP, Macros), collectPragmaMarksCallback(*SourceMgr, Marks)); } @@ -215,6 +216,7 @@ class CppFilePreambleCallbacks : public PreambleCallbacks { std::unique_ptr IWYUHandler = nullptr; const clang::LangOptions *LangOpts = nullptr; const SourceManager *SourceMgr = nullptr; + const Preprocessor *PP = nullptr; PreambleBuildStats *Stats; bool ParseForwardingFunctions; std::function BeforeExecuteCallback; @@ -382,7 +384,7 @@ scanPreamble(llvm::StringRef Contents, const tooling::CompileCommand &Cmd) { PP.addPPCallbacks( std::make_unique(PP, SP.TextualDirectives)); PP.addPPCallbacks(collectPragmaMarksCallback(SM, SP.Marks)); - PP.addPPCallbacks(std::make_unique(SM, SP.Macros)); + PP.addPPCallbacks(std::make_unique(PP, SP.Macros)); if (llvm::Error Err = Action.Execute()) return std::move(Err); Action.EndSourceFile(); diff --git 
a/clang-tools-extra/clangd/unittests/CollectMacrosTests.cpp b/clang-tools-extra/clangd/unittests/CollectMacrosTests.cpp index 196ed5cea4693..163a7f1a31707 100644 --- a/clang-tools-extra/clangd/unittests/CollectMacrosTests.cpp +++ b/clang-tools-extra/clangd/unittests/CollectMacrosTests.cpp @@ -8,12 +8,14 @@ #include "AST.h" #include "Annotations.h" #include "CollectMacros.h" +#include "Matchers.h" #include "SourceCode.h" #include "TestTU.h" #include "clang/Basic/SourceLocation.h" #include "llvm/Support/ScopedPrinter.h" #include "gmock/gmock.h" #include "gtest/gtest.h" +#include namespace clang { namespace clangd { @@ -21,19 +23,24 @@ namespace { using testing::UnorderedElementsAreArray; +MATCHER_P(rangeIs, R, "") { return arg.Rng == R; } +MATCHER(isDef, "") { return arg.IsDefinition; } +MATCHER(inConditionalDirective, "") { return arg.InConditionalDirective; } + TEST(CollectMainFileMacros, SelectedMacros) { // References of the same symbol must have the ranges with the same // name(integer). If there are N different symbols then they must be named // from 1 to N. Macros for which SymbolID cannot be computed must be named - // "Unknown". + // "Unknown". The payload of the annotation describes the extra bit + // information of the MacroOccurrence (e.g. $1(def) => IsDefinition). const char *Tests[] = { R"cpp(// Macros: Cursor on definition. - #define $1[[FOO]](x,y) (x + y) + #define $1(def)[[FOO]](x,y) (x + y) int main() { int x = $1[[FOO]]($1[[FOO]](3, 4), $1[[FOO]](5, 6)); } )cpp", R"cpp( - #define $1[[M]](X) X; - #define $2[[abc]] 123 + #define $1(def)[[M]](X) X; + #define $2(def)[[abc]] 123 int s = $1[[M]]($2[[abc]]); )cpp", // FIXME: Locating macro in duplicate definitions doesn't work. Enable @@ -48,31 +55,50 @@ TEST(CollectMainFileMacros, SelectedMacros) { // #undef $2[[abc]] // )cpp", R"cpp( - #ifdef $Unknown[[UNDEFINED]] + #ifdef $Unknown(condit)[[UNDEFINED]] + #endif + + #ifndef $Unknown(condit)[[UNDEFINED]] + #endif + + #if defined($Unknown(condit)[[UNDEFINED]]) #endif )cpp", R"cpp( - #ifndef $Unknown[[abc]] - #define $1[[abc]] - #ifdef $1[[abc]] + #ifndef $Unknown(condit)[[abc]] + #define $1(def)[[abc]] + #ifdef $1(condit)[[abc]] #endif #endif )cpp", R"cpp( // Macros from token concatenations not included. - #define $1[[CONCAT]](X) X##A() - #define $2[[PREPEND]](X) MACRO##X() - #define $3[[MACROA]]() 123 + #define $1(def)[[CONCAT]](X) X##A() + #define $2(def)[[PREPEND]](X) MACRO##X() + #define $3(def)[[MACROA]]() 123 int B = $1[[CONCAT]](MACRO); int D = $2[[PREPEND]](A); )cpp", R"cpp( - // FIXME: Macro names in a definition are not detected. - #define $1[[MACRO_ARGS2]](X, Y) X Y - #define $2[[FOO]] BAR - #define $3[[BAR]] 1 + #define $1(def)[[MACRO_ARGS2]](X, Y) X Y + #define $3(def)[[BAR]] 1 + #define $2(def)[[FOO]] $3[[BAR]] int A = $2[[FOO]]; )cpp"}; + auto ExpectedResults = [](const Annotations &T, StringRef Name) { + std::vector> ExpectedLocations; + for (const auto &[R, Bits] : T.rangesWithPayload(Name)) { + if (Bits == "def") + ExpectedLocations.push_back(testing::AllOf(rangeIs(R), isDef())); + else if (Bits == "condit") + ExpectedLocations.push_back( + testing::AllOf(rangeIs(R), inConditionalDirective())); + else + ExpectedLocations.push_back(testing::AllOf(rangeIs(R))); + } + return ExpectedLocations; + }; + for (const char *Test : Tests) { Annotations T(Test); auto AST = TestTU::withCode(T.code()).build(); @@ -80,13 +106,16 @@ TEST(CollectMainFileMacros, SelectedMacros) { auto &SM = AST.getSourceManager(); auto &PP = AST.getPreprocessor(); - // Known macros. 
- for (int I = 1;; I++) { - const auto ExpectedRefs = T.ranges(llvm::to_string(I)); - if (ExpectedRefs.empty()) - break; + for (const auto &[Name, Ranges] : T.all_ranges()) { + if (Name == "Unknown") { + EXPECT_THAT(ActualMacroRefs.UnknownMacros, + UnorderedElementsAreArray(ExpectedResults(T, "Unknown"))) + << "Unknown macros doesn't match in " << Test; + continue; + } - auto Loc = sourceLocationInMainFile(SM, ExpectedRefs.begin()->start); + auto Loc = sourceLocationInMainFile( + SM, offsetToPosition(T.code(), Ranges.front().Begin)); ASSERT_TRUE(bool(Loc)); const auto *Id = syntax::spelledIdentifierTouching(*Loc, AST.getTokens()); ASSERT_TRUE(Id); @@ -94,19 +123,11 @@ TEST(CollectMainFileMacros, SelectedMacros) { assert(Macro); auto SID = getSymbolID(Macro->Name, Macro->Info, SM); - std::vector Ranges; - for (const auto &Ref : ActualMacroRefs.MacroRefs[SID]) - Ranges.push_back(Ref.Rng); - EXPECT_THAT(ExpectedRefs, UnorderedElementsAreArray(Ranges)) - << "Annotation=" << I << ", MacroName=" << Macro->Name + EXPECT_THAT(ActualMacroRefs.MacroRefs[SID], + UnorderedElementsAreArray(ExpectedResults(T, Name))) + << "Annotation=" << Name << ", MacroName=" << Macro->Name << ", Test = " << Test; } - // Unknown macros. - std::vector Ranges; - for (const auto &Ref : AST.getMacros().UnknownMacros) - Ranges.push_back(Ref.Rng); - EXPECT_THAT(Ranges, UnorderedElementsAreArray(T.ranges("Unknown"))) - << "Unknown macros doesn't match in " << Test; } } } // namespace diff --git a/clang-tools-extra/clangd/unittests/SemanticHighlightingTests.cpp b/clang-tools-extra/clangd/unittests/SemanticHighlightingTests.cpp index 259efcf54a6b2..975378118b7ad 100644 --- a/clang-tools-extra/clangd/unittests/SemanticHighlightingTests.cpp +++ b/clang-tools-extra/clangd/unittests/SemanticHighlightingTests.cpp @@ -399,7 +399,7 @@ TEST(SemanticHighlighting, GetsCorrectTokens) { #define $Macro_decl[[MACRO_CONCAT]](X, V, T) T foo##X = V #define $Macro_decl[[DEF_VAR]](X, V) int X = V #define $Macro_decl[[DEF_VAR_T]](T, X, V) T X = V - #define $Macro_decl[[DEF_VAR_REV]](V, X) DEF_VAR(X, V) + #define $Macro_decl[[DEF_VAR_REV]](V, X) $Macro[[DEF_VAR]](X, V) #define $Macro_decl[[CPY]](X) X #define $Macro_decl[[DEF_VAR_TYPE]](X, Y) X Y #define $Macro_decl[[SOME_NAME]] variable @@ -431,7 +431,7 @@ TEST(SemanticHighlighting, GetsCorrectTokens) { )cpp", R"cpp( #define $Macro_decl[[fail]](expr) expr - #define $Macro_decl[[assert]](COND) if (!(COND)) { fail("assertion failed" #COND); } + #define $Macro_decl[[assert]](COND) if (!(COND)) { $Macro[[fail]]("assertion failed" #COND); } // Preamble ends. int $Variable_def[[x]]; int $Variable_def[[y]]; From 4dc72d47ce88218ff3c6a7ae724beb6ab3ba2ade Mon Sep 17 00:00:00 2001 From: Nicolas Vasilache Date: Tue, 21 Mar 2023 14:41:20 -0700 Subject: [PATCH 098/208] [mlir][Tensor] Add a FoldTensorSubsetOps pass and patterns These patterns follow FoldMemRefAliasOps which is further refactored for reuse. In the process, fix FoldMemRefAliasOps handling of strides for vector.transfer ops which was previously incorrect. These opt-in patterns generalize the existing canonicalizations on vector.transfer ops. In the future the blanket canonicalizations will be retired. They are kept for now to minimize porting disruptions. 
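As a usage sketch (not part of this change; the helper function below is
hypothetical, only the populate/apply calls come from this patch and existing
MLIR APIs), a downstream pass can opt in to the new patterns instead of
relying on the blanket vector.transfer canonicalizations:

  #include "mlir/IR/PatternMatch.h"
  #include "mlir/Dialect/Tensor/Transforms/Transforms.h"
  #include "mlir/Transforms/GreedyPatternRewriteDriver.h"

  // Folds tensor.extract_slice into vector.transfer_read, and
  // vector.transfer_write into tensor.insert_slice, underneath `op`, where
  // the preconditions hold (unit strides, unmasked, no out-of-bounds dims).
  static void foldTensorSubsets(mlir::Operation *op) {
    mlir::RewritePatternSet patterns(op->getContext());
    mlir::tensor::populateFoldTensorSubsetOpPatterns(patterns);
    (void)mlir::applyPatternsAndFoldGreedily(op, std::move(patterns));
  }

The same rewrites are also exposed as the standalone -fold-tensor-subset-ops
pass added below.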
Differential Revision: https://reviews.llvm.org/D146624 --- .../Dialect/Affine/ViewLikeInterfaceUtils.h | 27 ++ .../mlir/Dialect/Tensor/IR/TensorOps.td | 4 + .../mlir/Dialect/Tensor/Transforms/Passes.h | 14 +- .../mlir/Dialect/Tensor/Transforms/Passes.td | 18 +- .../Dialect/Tensor/Transforms/Transforms.h | 25 +- mlir/include/mlir/IR/AffineMap.h | 18 +- .../Affine/Utils/ViewLikeInterfaceUtils.cpp | 32 +++ .../MemRef/Transforms/FoldMemRefAliasOps.cpp | 168 +++++------ mlir/lib/Dialect/Tensor/IR/TensorOps.cpp | 20 ++ .../Dialect/Tensor/Transforms/Bufferize.cpp | 2 +- .../Dialect/Tensor/Transforms/CMakeLists.txt | 2 + .../Tensor/Transforms/FoldTensorSubsetOps.cpp | 173 ++++++++++++ ...eConsecutiveInsertExtractSlicePatterns.cpp | 2 + mlir/lib/Dialect/Tensor/Utils/CMakeLists.txt | 1 + mlir/lib/Dialect/Tensor/Utils/Utils.cpp | 1 + mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 21 +- mlir/lib/IR/AffineMap.cpp | 20 ++ .../Dialect/MemRef/fold-memref-alias-ops.mlir | 55 ++-- .../Tensor/fold-tensor-subset-ops.mlir | 262 ++++++++++++++++++ .../llvm-project-overlay/mlir/BUILD.bazel | 2 + 20 files changed, 711 insertions(+), 156 deletions(-) create mode 100644 mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp create mode 100644 mlir/test/Dialect/Tensor/fold-tensor-subset-ops.mlir diff --git a/mlir/include/mlir/Dialect/Affine/ViewLikeInterfaceUtils.h b/mlir/include/mlir/Dialect/Affine/ViewLikeInterfaceUtils.h index 3fac9409bf381..42156ac5de24d 100644 --- a/mlir/include/mlir/Dialect/Affine/ViewLikeInterfaceUtils.h +++ b/mlir/include/mlir/Dialect/Affine/ViewLikeInterfaceUtils.h @@ -13,6 +13,7 @@ #include "mlir/Interfaces/ViewLikeInterface.h" namespace mlir { +class RewriterBase; /// Fills the `combinedOffsets`, `combinedSizes` and `combinedStrides` to use /// when combining a producer slice **into** a consumer slice. @@ -21,6 +22,7 @@ namespace mlir { /// - Combined offsets = producer_offsets * consumer_strides + consumer_offsets /// - Combined sizes = consumer_sizes /// - Combined strides = producer_strides * consumer_strides +// TODO: unify this API with resolveSourceIndicesOffsetsAndStrides or deprecate. LogicalResult mergeOffsetsSizesAndStrides(OpBuilder &builder, Location loc, ArrayRef producerOffsets, @@ -36,6 +38,7 @@ mergeOffsetsSizesAndStrides(OpBuilder &builder, Location loc, /// Fills the `combinedOffsets`, `combinedSizes` and `combinedStrides` to use /// when combining a `producer` slice op **into** a `consumer` slice op. +// TODO: unify this API with resolveSourceIndicesOffsetsAndStrides or deprecate. LogicalResult mergeOffsetsSizesAndStrides(OpBuilder &builder, Location loc, OffsetSizeAndStrideOpInterface producer, @@ -45,6 +48,30 @@ mergeOffsetsSizesAndStrides(OpBuilder &builder, Location loc, SmallVector &combinedSizes, SmallVector &combinedStrides); +/// Given the 'indicesVals' of a load/store operation operating on an op with +/// offsets and strides, return the combined indices. +/// +/// For example, using `memref.load` and `memref.subview` as an illustration: +/// +/// ``` +/// %0 = ... 
: memref<12x42xf32> +/// %1 = memref.subview %0[%arg0, %arg1][...][%stride1, %stride2] : +/// memref<12x42xf32> to memref<4x4xf32, offset=?, strides=[?, ?]> +/// %2 = load %1[%i1, %i2] : memref<4x4xf32, offset=?, strides=[?, ?]> +/// ``` +/// +/// could be folded into: +/// +/// ``` +/// %2 = load %0[%arg0 + %i1 * %stride1][%arg1 + %i2 * %stride2] : +/// memref<12x42xf32> +/// ``` +void resolveSourceIndicesOffsetsAndStrides( + RewriterBase &rewriter, Location loc, ArrayRef mixedOffsets, + ArrayRef mixedStrides, + const llvm::SmallBitVector &rankReducedDims, ValueRange indicesVals, + SmallVectorImpl &sourceIndices); + } // namespace mlir #endif // MLIR_DIALECT_AFFINE_VIEWLIKEINTERFACEUTILS_H diff --git a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td index 66d6dcc7b27ed..721615fdd2607 100644 --- a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td +++ b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td @@ -858,6 +858,10 @@ def Tensor_InsertSliceOp : Tensor_OpWithOffsetSizesAndStrides<"insert_slice", [ return {rank, rank, rank}; } + /// Return the dimensions of the dest that are omitted to insert a source + /// when the result is rank-extended. + llvm::SmallBitVector getDroppedDims(); + /// Return the number of leading operands before the `offsets`, `sizes` and /// and `strides` operands. static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 2; } diff --git a/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.h b/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.h index df695dbec19a7..48f9066934a25 100644 --- a/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.h @@ -12,23 +12,27 @@ #include "mlir/Pass/Pass.h" namespace mlir { +namespace tensor { -#define GEN_PASS_DECL -#include "mlir/Dialect/Tensor/Transforms/Passes.h.inc" +//===----------------------------------------------------------------------===// +// Passes +//===----------------------------------------------------------------------===// -/// Creates an instance of `tensor` dialect bufferization pass. +/// Creates an instance of the `tensor` subset folding pass. +std::unique_ptr createFoldTensorSubsetOpsPass(); + +/// Creates an instance of the `tensor` dialect bufferization pass. std::unique_ptr createTensorBufferizePass(); //===----------------------------------------------------------------------===// // Registration //===----------------------------------------------------------------------===// -namespace tensor { /// Generate the code for registering passes. #define GEN_PASS_REGISTRATION #include "mlir/Dialect/Tensor/Transforms/Passes.h.inc" -} // namespace tensor +} // namespace tensor } // namespace mlir #endif // MLIR_DIALECT_TENSOR_TRANSFORMS_PASSES_H_ diff --git a/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.td b/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.td index 2bf774d404bf5..b4673599a5def 100644 --- a/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.td @@ -11,9 +11,25 @@ include "mlir/Pass/PassBase.td" +def FoldTensorSubsetOps : Pass<"fold-tensor-subset-ops"> { + let summary = "Fold tensor subset ops into producer/consumer ops"; + let description = [{ + The pass folds tensor subset ops into producer/consumer ops. 
+ + At the moment, the following foldings occur when possible: + - tensor.extract_slice into vector.transfer_read + - vector.transfer_write into tensor.insert_slice + + }]; + let constructor = "mlir::tensor::createFoldTensorSubsetOpsPass()"; + let dependentDialects = [ + "AffineDialect", "tensor::TensorDialect", "vector::VectorDialect" + ]; +} + def TensorBufferize : Pass<"tensor-bufferize", "func::FuncOp"> { let summary = "Bufferize the `tensor` dialect"; - let constructor = "mlir::createTensorBufferizePass()"; + let constructor = "mlir::tensor::createTensorBufferizePass()"; } #endif // MLIR_DIALECT_TENSOR_TRANSFORMS_PASSES diff --git a/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h index 4cdf360c51d72..c0c46e9981dfa 100644 --- a/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h @@ -18,11 +18,9 @@ struct TilingResult; namespace tensor { -/// Populates `patterns` with patterns to wrap a tensor.pad op with an scf.if op -/// to separate the cases where we don't need padding (all pad sizes are -/// actually zeros) and where we indeed need padding. -void populateSplitPaddingPatterns(RewritePatternSet &patterns, - PatternBenefit baseBenefit = 1); +//===----------------------------------------------------------------------===// +// Patterns +//===----------------------------------------------------------------------===// /// Pattern to swap an `tensor.extract_slice` with its producer when the /// producer implements the `TilingInterface`. The pattern itself does not @@ -32,6 +30,23 @@ void populateSplitPaddingPatterns(RewritePatternSet &patterns, FailureOr replaceExtractSliceWithTiledProducer( OpBuilder &builder, tensor::ExtractSliceOp sliceOp, OpResult producerOp); +//===----------------------------------------------------------------------===// +// Populate functions. +//===----------------------------------------------------------------------===// + +/// Collects a set of patterns to rewrite ops within the tensor dialect. +void populateExpandOpsPatterns(RewritePatternSet &patterns); + +/// Appends patterns for folding tensor aliasing ops into consumer load/store +/// ops into `patterns`. +void populateFoldTensorSubsetOpPatterns(RewritePatternSet &patterns); + +/// Populates `patterns` with patterns to wrap a tensor.pad op with an scf.if op +/// to separate the cases where we don't need padding (all pad sizes are +/// actually zeros) and where we indeed need padding. +void populateSplitPaddingPatterns(RewritePatternSet &patterns, + PatternBenefit baseBenefit = 1); + /// Collects patterns to merge consecutive tensor.insert_slice/extract_slice /// into one. These patterns are in in this separate entry point because the /// bufferization is sensitive over IR structure, particularly those diff --git a/mlir/include/mlir/IR/AffineMap.h b/mlir/include/mlir/IR/AffineMap.h index cc7c794f1f933..75a268c483955 100644 --- a/mlir/include/mlir/IR/AffineMap.h +++ b/mlir/include/mlir/IR/AffineMap.h @@ -249,11 +249,11 @@ class AffineMap { /// Returns a new AffineMap with the same number of dims and symbols and one /// less result at `pos`, dropped. - AffineMap dropResult(int64_t pos) { return dropResults({pos}); } + AffineMap dropResult(int64_t pos) const { return dropResults({pos}); } // Returns a new AffineMap with the same number of dims and symbols, but all - // positions in `positions` dropped from results. 
- AffineMap dropResults(ArrayRef positions) { + // results in `positions` dropped. + AffineMap dropResults(ArrayRef positions) const { SmallVector reverse_sorted_positions = llvm::to_vector(positions); llvm::sort(reverse_sorted_positions, std::greater()); @@ -263,9 +263,13 @@ class AffineMap { return AffineMap::get(getNumDims(), getNumSymbols(), exprs, getContext()); } + // Returns a new AffineMap with the same number of dims and symbols, but all + // results in `positions` dropped. + AffineMap dropResults(const llvm::SmallBitVector &positions) const; + /// Returns a new AffineMap with the same number of dims and symbols and an /// extra result inserted at `pos`. - AffineMap insertResult(AffineExpr expr, unsigned pos) { + AffineMap insertResult(AffineExpr expr, unsigned pos) const { auto exprs = llvm::to_vector<4>(getResults()); exprs.insert(exprs.begin() + pos, expr); return AffineMap::get(getNumDims(), getNumSymbols(), exprs, getContext()); @@ -583,6 +587,12 @@ llvm::SmallBitVector getUnusedDimsBitVector(ArrayRef maps); // by any of the maps in the input array `maps`. llvm::SmallBitVector getUnusedSymbolsBitVector(ArrayRef maps); +/// Expand `map` to operate on `rank` dims while projecting out the dims in +/// `projectedDimensions`. This amounts to composing `map` with +/// `id(rank).dropResults(projectedDimensions)`. +AffineMap expandDimsToRank(AffineMap map, int64_t rank, + const llvm::SmallBitVector &projectedDimensions); + inline raw_ostream &operator<<(raw_ostream &os, AffineMap map) { map.print(os); return os; diff --git a/mlir/lib/Dialect/Affine/Utils/ViewLikeInterfaceUtils.cpp b/mlir/lib/Dialect/Affine/Utils/ViewLikeInterfaceUtils.cpp index c506239744c48..f53edcefe3c79 100644 --- a/mlir/lib/Dialect/Affine/Utils/ViewLikeInterfaceUtils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/ViewLikeInterfaceUtils.cpp @@ -8,6 +8,8 @@ #include "mlir/Dialect/Affine/ViewLikeInterfaceUtils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/Utils/Utils.h" +#include "mlir/IR/PatternMatch.h" using namespace mlir; @@ -74,3 +76,33 @@ LogicalResult mlir::mergeOffsetsSizesAndStrides( droppedProducerDims, consumerOffsets, consumerSizes, consumerStrides, combinedOffsets, combinedSizes, combinedStrides); } + +void mlir::resolveSourceIndicesOffsetsAndStrides( + RewriterBase &rewriter, Location loc, ArrayRef mixedOffsets, + ArrayRef mixedStrides, + const llvm::SmallBitVector &rankReducedDims, ValueRange indicesVals, + SmallVectorImpl &sourceIndices) { + OpFoldResult zero = rewriter.getIndexAttr(0); + + // For each dimension that is rank-reduced, add a zero to the indices. + int64_t indicesDim = 0; + SmallVector indices; + for (auto dim : llvm::seq(0, mixedOffsets.size())) { + OpFoldResult ofr = + (rankReducedDims.test(dim)) ? 
zero : indicesVals[indicesDim++]; + indices.push_back(ofr); + } + + sourceIndices.resize(indices.size()); + sourceIndices.clear(); + for (auto [offset, index, stride] : + llvm::zip_equal(mixedOffsets, indices, mixedStrides)) { + AffineExpr off, idx, str; + bindSymbols(rewriter.getContext(), off, idx, str); + OpFoldResult ofr = makeComposedFoldedAffineApply( + rewriter, loc, AffineMap::get(0, 3, off + idx * str), + {offset, index, stride}); + sourceIndices.push_back( + getValueOrCreateConstantIndexOp(rewriter, loc, ofr)); + } +} diff --git a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp index c1c3478b06efc..c850348c85480 100644 --- a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp +++ b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/ViewLikeInterfaceUtils.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" @@ -19,7 +20,9 @@ #include "mlir/Dialect/MemRef/Transforms/Passes.h" #include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/IR/AffineMap.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/TypeSwitch.h" @@ -150,70 +153,6 @@ resolveSourceIndicesCollapseShape(Location loc, PatternRewriter &rewriter, return success(); } -/// Given the 'indices' of an load/store operation where the memref is a result -/// of a subview op, returns the indices w.r.t to the source memref of the -/// subview op. For example -/// -/// %0 = ... : memref<12x42xf32> -/// %1 = subview %0[%arg0, %arg1][][%stride1, %stride2] : memref<12x42xf32> to -/// memref<4x4xf32, offset=?, strides=[?, ?]> -/// %2 = load %1[%i1, %i2] : memref<4x4xf32, offset=?, strides=[?, ?]> -/// -/// could be folded into -/// -/// %2 = load %0[%arg0 + %i1 * %stride1][%arg1 + %i2 * %stride2] : -/// memref<12x42xf32> -static LogicalResult -resolveSourceIndicesSubView(Location loc, PatternRewriter &rewriter, - memref::SubViewOp subViewOp, ValueRange indices, - SmallVectorImpl &sourceIndices) { - SmallVector mixedOffsets = subViewOp.getMixedOffsets(); - SmallVector mixedSizes = subViewOp.getMixedSizes(); - SmallVector mixedStrides = subViewOp.getMixedStrides(); - - SmallVector useIndices; - // Check if this is rank-reducing case. Then for every unit-dim size add a - // zero to the indices. - int64_t resultDim = 0; - llvm::SmallBitVector unusedDims = subViewOp.getDroppedDims(); - for (auto dim : llvm::seq(0, subViewOp.getSourceType().getRank())) { - if (unusedDims.test(dim)) - useIndices.push_back(rewriter.create(loc, 0)); - else - useIndices.push_back(indices[resultDim++]); - } - if (useIndices.size() != mixedOffsets.size()) - return failure(); - sourceIndices.resize(useIndices.size()); - for (auto index : llvm::seq(0, mixedOffsets.size())) { - SmallVector dynamicOperands; - AffineExpr expr = rewriter.getAffineDimExpr(0); - int64_t numSymbols = 0; - dynamicOperands.push_back(useIndices[index]); - - // Multiply the stride; - if (auto attr = mixedStrides[index].dyn_cast()) { - expr = expr * attr.cast().getInt(); - } else { - dynamicOperands.push_back(mixedStrides[index].get()); - expr = expr * rewriter.getAffineSymbolExpr(numSymbols++); - } - - // Add the offset. 
- if (auto attr = mixedOffsets[index].dyn_cast()) { - expr = expr + attr.cast().getInt(); - } else { - dynamicOperands.push_back(mixedOffsets[index].get()); - expr = expr + rewriter.getAffineSymbolExpr(numSymbols++); - } - Location loc = subViewOp.getLoc(); - OpFoldResult ofr = makeComposedFoldedAffineApply( - rewriter, loc, AffineMap::get(1, numSymbols, expr), dynamicOperands); - sourceIndices[index] = getValueOrCreateConstantIndexOp(rewriter, loc, ofr); - } - return success(); -} - /// Helpers to access the memref operand for each op. template static Value getMemRefOperand(LoadOrStoreOpTy op) { @@ -236,25 +175,6 @@ static Value getMemRefOperand(gpu::SubgroupMmaStoreMatrixOp op) { return op.getDstMemref(); } -/// Given the permutation map of the original -/// `vector.transfer_read`/`vector.transfer_write` operations compute the -/// permutation map to use after the subview is folded with it. -static AffineMapAttr getPermutationMapAttr(MLIRContext *context, - memref::SubViewOp subViewOp, - AffineMap currPermutationMap) { - llvm::SmallBitVector unusedDims = subViewOp.getDroppedDims(); - SmallVector exprs; - int64_t sourceRank = subViewOp.getSourceType().getRank(); - for (auto dim : llvm::seq(0, sourceRank)) { - if (unusedDims.test(dim)) - continue; - exprs.push_back(getAffineDimExpr(dim, context)); - } - auto resultDimToSourceDimMap = AffineMap::get(sourceRank, 0, exprs, context); - return AffineMapAttr::get( - currPermutationMap.compose(resultDimToSourceDimMap)); -} - //===----------------------------------------------------------------------===// // Patterns //===----------------------------------------------------------------------===// @@ -390,6 +310,42 @@ calculateExpandedAccessIndices(AffineMap affineMap, return expandedIndices; } +template +static LogicalResult +preconditionsFoldSubViewOpImpl(RewriterBase &rewriter, XferOp xferOp, + memref::SubViewOp subviewOp) { + static_assert( + !llvm::is_one_of::value, + "must be a vector transfer op"); + if (xferOp.hasOutOfBoundsDim()) + return rewriter.notifyMatchFailure(xferOp, "out of bounds transfer dim"); + if (xferOp.getMask()) + return rewriter.notifyMatchFailure(xferOp, "masked transfer"); + if (!subviewOp.hasUnitStride()) { + return rewriter.notifyMatchFailure( + xferOp, "non-1 stride subview, need to track strides in folded memref"); + } + return success(); +} + +static LogicalResult preconditionsFoldSubViewOp(RewriterBase &rewriter, + Operation *op, + memref::SubViewOp subviewOp) { + return success(); +} + +static LogicalResult preconditionsFoldSubViewOp(RewriterBase &rewriter, + vector::TransferReadOp readOp, + memref::SubViewOp subviewOp) { + return preconditionsFoldSubViewOpImpl(rewriter, readOp, subviewOp); +} + +static LogicalResult preconditionsFoldSubViewOp(RewriterBase &rewriter, + vector::TransferWriteOp writeOp, + memref::SubViewOp subviewOp) { + return preconditionsFoldSubViewOpImpl(rewriter, writeOp, subviewOp); +} + template LogicalResult LoadOpOfSubViewOpFolder::matchAndRewrite( OpTy loadOp, PatternRewriter &rewriter) const { @@ -397,7 +353,12 @@ LogicalResult LoadOpOfSubViewOpFolder::matchAndRewrite( getMemRefOperand(loadOp).template getDefiningOp(); if (!subViewOp) - return failure(); + return rewriter.notifyMatchFailure(loadOp, "not a subview producer"); + + LogicalResult preconditionResult = + preconditionsFoldSubViewOp(rewriter, loadOp, subViewOp); + if (failed(preconditionResult)) + return preconditionResult; SmallVector indices(loadOp.getIndices().begin(), loadOp.getIndices().end()); @@ -410,9 +371,10 @@ 
LogicalResult LoadOpOfSubViewOpFolder::matchAndRewrite( indices.assign(expandedIndices.begin(), expandedIndices.end()); } SmallVector sourceIndices; - if (failed(resolveSourceIndicesSubView(loadOp.getLoc(), rewriter, subViewOp, - indices, sourceIndices))) - return failure(); + resolveSourceIndicesOffsetsAndStrides( + rewriter, loadOp.getLoc(), subViewOp.getMixedOffsets(), + subViewOp.getMixedStrides(), subViewOp.getDroppedDims(), indices, + sourceIndices); llvm::TypeSwitch(loadOp) .Case([&](AffineLoadOp op) { @@ -423,14 +385,13 @@ LogicalResult LoadOpOfSubViewOpFolder::matchAndRewrite( rewriter.replaceOpWithNewOp( loadOp, subViewOp.getSource(), sourceIndices, op.getNontemporal()); }) - .Case([&](vector::TransferReadOp transferReadOp) { + .Case([&](vector::TransferReadOp op) { rewriter.replaceOpWithNewOp( - transferReadOp, transferReadOp.getVectorType(), - subViewOp.getSource(), sourceIndices, - getPermutationMapAttr(rewriter.getContext(), subViewOp, - transferReadOp.getPermutationMap()), - transferReadOp.getPadding(), - /*mask=*/Value(), transferReadOp.getInBoundsAttr()); + op, op.getVectorType(), subViewOp.getSource(), sourceIndices, + AffineMapAttr::get(expandDimsToRank( + op.getPermutationMap(), subViewOp.getSourceType().getRank(), + subViewOp.getDroppedDims())), + op.getPadding(), /*mask=*/Value(), op.getInBoundsAttr()); }) .Case([&](gpu::SubgroupMmaLoadMatrixOp op) { rewriter.replaceOpWithNewOp( @@ -512,7 +473,12 @@ LogicalResult StoreOpOfSubViewOpFolder::matchAndRewrite( getMemRefOperand(storeOp).template getDefiningOp(); if (!subViewOp) - return failure(); + return rewriter.notifyMatchFailure(storeOp, "not a subview producer"); + + LogicalResult preconditionResult = + preconditionsFoldSubViewOp(rewriter, storeOp, subViewOp); + if (failed(preconditionResult)) + return preconditionResult; SmallVector indices(storeOp.getIndices().begin(), storeOp.getIndices().end()); @@ -525,9 +491,10 @@ LogicalResult StoreOpOfSubViewOpFolder::matchAndRewrite( indices.assign(expandedIndices.begin(), expandedIndices.end()); } SmallVector sourceIndices; - if (failed(resolveSourceIndicesSubView(storeOp.getLoc(), rewriter, subViewOp, - indices, sourceIndices))) - return failure(); + resolveSourceIndicesOffsetsAndStrides( + rewriter, storeOp.getLoc(), subViewOp.getMixedOffsets(), + subViewOp.getMixedStrides(), subViewOp.getDroppedDims(), indices, + sourceIndices); llvm::TypeSwitch(storeOp) .Case([&](AffineStoreOp op) { @@ -542,8 +509,9 @@ LogicalResult StoreOpOfSubViewOpFolder::matchAndRewrite( .Case([&](vector::TransferWriteOp op) { rewriter.replaceOpWithNewOp( op, op.getValue(), subViewOp.getSource(), sourceIndices, - getPermutationMapAttr(rewriter.getContext(), subViewOp, - op.getPermutationMap()), + AffineMapAttr::get(expandDimsToRank( + op.getPermutationMap(), subViewOp.getSourceType().getRank(), + subViewOp.getDroppedDims())), op.getInBoundsAttr()); }) .Case([&](gpu::SubgroupMmaStoreMatrixOp op) { diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp index 9d26e51e04fd5..93db7da27abdd 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp @@ -2396,6 +2396,26 @@ struct InsertSliceOpSourceCastInserter final }; } // namespace +llvm::SmallBitVector InsertSliceOp::getDroppedDims() { + ArrayRef resultShape = getType().getShape(); + SmallVector mixedSizes = getMixedSizes(); + llvm::SmallBitVector droppedDims(mixedSizes.size()); + unsigned shapePos = 0; + for (const auto &size : enumerate(mixedSizes)) { + std::optional 
sizeVal = getConstantIntValue(size.value()); + // If the size is not 1, or if the current matched dimension of the result + // is the same static shape as the size value (which is 1), then the + // dimension is preserved. + if (!sizeVal || *sizeVal != 1 || + (shapePos < resultShape.size() && resultShape[shapePos] == 1)) { + shapePos++; + continue; + } + droppedDims.set(size.index()); + } + return droppedDims; +} + void InsertSliceOp::getCanonicalizationPatterns(RewritePatternSet &results, MLIRContext *context) { results.add, diff --git a/mlir/lib/Dialect/Tensor/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Tensor/Transforms/Bufferize.cpp index 426b1363c6a0e..d27c4576a8b7a 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/Bufferize.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/Bufferize.cpp @@ -53,6 +53,6 @@ struct TensorBufferizePass }; } // namespace -std::unique_ptr mlir::createTensorBufferizePass() { +std::unique_ptr mlir::tensor::createTensorBufferizePass() { return std::make_unique(); } diff --git a/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt index 5ed3d97b2719f..9f6780730dc71 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt @@ -4,6 +4,7 @@ add_mlir_dialect_library(MLIRTensorTransforms EmptyOpPatterns.cpp ExtractSliceFromReshapeUtils.cpp FoldIntoPackAndUnpackPatterns.cpp + FoldTensorSubsetOps.cpp MergeConsecutiveInsertExtractSlicePatterns.cpp ReshapePatterns.cpp SplitPaddingPatterns.cpp @@ -29,4 +30,5 @@ add_mlir_dialect_library(MLIRTensorTransforms MLIRTensorDialect MLIRTilingInterface MLIRTransforms + MLIRVectorDialect ) diff --git a/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp b/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp new file mode 100644 index 0000000000000..80ecb868dff6a --- /dev/null +++ b/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp @@ -0,0 +1,173 @@ +//===- FoldTensorSubsetOps.cpp - Fold tensor subset ops -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Fold tensor subset ops with producer / consumers. 
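As an aside on what this new file computes (an illustration, not part of the diff): for the unit-stride extract_slice/insert_slice cases handled here, folding the slice into a vector transfer reduces to adding each slice offset to the matching transfer index, with rank-reduced dimensions contributing only their offset. Below is a minimal standalone C++ model of that index arithmetic; the name foldUnitStrideIndices and the plain integer operands are invented for illustration, while the actual pass goes through resolveSourceIndicesOffsetsAndStrides and emits affine.apply ops so that symbolic offsets also work.

#include <cassert>
#include <cstdint>
#include <vector>

// Fold the indices of a unit-stride slice access into indices on the original
// source: every source dimension gets its slice offset, and non-dropped
// dimensions additionally get the corresponding transfer index.
std::vector<int64_t>
foldUnitStrideIndices(const std::vector<int64_t> &sliceOffsets,
                      const std::vector<bool> &droppedDims,
                      const std::vector<int64_t> &transferIndices) {
  std::vector<int64_t> sourceIndices;
  size_t transferPos = 0;
  for (size_t dim = 0; dim < sliceOffsets.size(); ++dim) {
    int64_t idx = sliceOffsets[dim];
    if (!droppedDims[dim]) {
      assert(transferPos < transferIndices.size() && "too few indices");
      idx += transferIndices[transferPos++]; // offset + index, i.e. (s0 + s1)
    }
    sourceIndices.push_back(idx);
  }
  assert(transferPos == transferIndices.size() && "too many indices");
  return sourceIndices;
}

int main() {
  // tensor.extract_slice %t[5, %s1] [10, %s2] [1, 1] read at [3, 4]: with %s1
  // modeled as 0, the folded read indices are [5 + 3, 0 + 4] = [8, 4].
  std::vector<int64_t> idx =
      foldUnitStrideIndices({5, 0}, {false, false}, {3, 4});
  return (idx[0] == 8 && idx[1] == 4) ? 0 : 1;
}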
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/ViewLikeInterfaceUtils.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Tensor/Transforms/Passes.h" +#include "mlir/Dialect/Tensor/Transforms/Transforms.h" +#include "mlir/Dialect/Utils/IndexingUtils.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/IR/AffineMap.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "llvm/ADT/TypeSwitch.h" + +namespace mlir { +namespace tensor { +#define GEN_PASS_DEF_FOLDTENSORSUBSETOPS +#include "mlir/Dialect/Tensor/Transforms/Passes.h.inc" +} // namespace tensor +} // namespace mlir + +using namespace mlir; + +static Value getTensorOperand(vector::TransferReadOp op) { + return op.getSource(); +} + +static Value getTensorOperand(tensor::InsertSliceOp op) { + return op.getSource(); +} + +//===----------------------------------------------------------------------===// +// Patterns +//===----------------------------------------------------------------------===// + +namespace { +/// Merge extract_slice operation with load/transferRead operation. +class TransferReadOfExtractSliceOpFolder final + : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(vector::TransferReadOp readOp, + PatternRewriter &rewriter) const override; +}; + +/// Merge insert_slice operation with store/transferWriteOp operation. +class InsertSliceOfTransferWriteOpFolder final + : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(tensor::InsertSliceOp insertSliceOp, + PatternRewriter &rewriter) const override; +}; +} // namespace + +template +static LogicalResult preconditionsFoldExtractOrInsertWithTransferOp( + RewriterBase &rewriter, XferOp xferOp, + ExtractOrInsertOp extractOrInsertSliceOp) { + if (xferOp.hasOutOfBoundsDim()) + return rewriter.notifyMatchFailure(xferOp, "out of bounds transfer dim"); + if (xferOp.getMask()) + return rewriter.notifyMatchFailure(xferOp, "masked transfer"); + if (!extractOrInsertSliceOp.hasUnitStride()) { + return rewriter.notifyMatchFailure( + xferOp, "non-1 stride insert/extract, requires keeping track of " + "strides, this may result in needing to insert " + "vector.insert_strided_slice/extract_strided_slice ops"); + } + return success(); +} + +LogicalResult TransferReadOfExtractSliceOpFolder::matchAndRewrite( + vector::TransferReadOp readOp, PatternRewriter &rewriter) const { + auto extractSliceOp = + getTensorOperand(readOp).getDefiningOp(); + if (!extractSliceOp) + return rewriter.notifyMatchFailure(readOp, "not an extract_slice"); + + LogicalResult preconditionResult = + preconditionsFoldExtractOrInsertWithTransferOp(rewriter, readOp, + extractSliceOp); + if (failed(preconditionResult)) + return preconditionResult; + + SmallVector indices(readOp.getIndices().begin(), + readOp.getIndices().end()); + SmallVector sourceIndices; + resolveSourceIndicesOffsetsAndStrides( + rewriter, readOp.getLoc(), extractSliceOp.getMixedOffsets(), + extractSliceOp.getMixedStrides(), extractSliceOp.getDroppedDims(), + indices, sourceIndices); + + rewriter.replaceOpWithNewOp( + readOp, readOp.getVectorType(), extractSliceOp.getSource(), sourceIndices, + AffineMapAttr::get(expandDimsToRank( + readOp.getPermutationMap(), extractSliceOp.getSourceType().getRank(), + extractSliceOp.getDroppedDims())), 
+ readOp.getPadding(), + /*mask=*/Value(), readOp.getInBoundsAttr()); + + return success(); +} + +LogicalResult InsertSliceOfTransferWriteOpFolder::matchAndRewrite( + tensor::InsertSliceOp insertSliceOp, PatternRewriter &rewriter) const { + auto writeOp = getTensorOperand(insertSliceOp) + .template getDefiningOp(); + if (!writeOp) + return rewriter.notifyMatchFailure(insertSliceOp, "not a transfer_write"); + + LogicalResult preconditionResult = + preconditionsFoldExtractOrInsertWithTransferOp(rewriter, writeOp, + insertSliceOp); + if (failed(preconditionResult)) + return preconditionResult; + + SmallVector indices(writeOp.getIndices().begin(), + writeOp.getIndices().end()); + SmallVector sourceIndices; + resolveSourceIndicesOffsetsAndStrides( + rewriter, writeOp.getLoc(), insertSliceOp.getMixedOffsets(), + insertSliceOp.getMixedStrides(), insertSliceOp.getDroppedDims(), indices, + sourceIndices); + + rewriter.replaceOpWithNewOp( + insertSliceOp, writeOp.getValue(), insertSliceOp.getDest(), sourceIndices, + AffineMapAttr::get(expandDimsToRank(writeOp.getPermutationMap(), + insertSliceOp.getDestType().getRank(), + insertSliceOp.getDroppedDims())), + writeOp.getInBoundsAttr()); + + return success(); +} + +void tensor::populateFoldTensorSubsetOpPatterns(RewritePatternSet &patterns) { + patterns.add(patterns.getContext()); +} +//===----------------------------------------------------------------------===// +// Pass registration +//===----------------------------------------------------------------------===// + +namespace { + +struct FoldTensorSubsetOpsPass final + : public tensor::impl::FoldTensorSubsetOpsBase { + void runOnOperation() override; +}; + +} // namespace + +void FoldTensorSubsetOpsPass::runOnOperation() { + RewritePatternSet patterns(&getContext()); + tensor::populateFoldTensorSubsetOpPatterns(patterns); + (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)); +} + +std::unique_ptr tensor::createFoldTensorSubsetOpsPass() { + return std::make_unique(); +} diff --git a/mlir/lib/Dialect/Tensor/Transforms/MergeConsecutiveInsertExtractSlicePatterns.cpp b/mlir/lib/Dialect/Tensor/Transforms/MergeConsecutiveInsertExtractSlicePatterns.cpp index 4169882046556..895d1b1f02f07 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/MergeConsecutiveInsertExtractSlicePatterns.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/MergeConsecutiveInsertExtractSlicePatterns.cpp @@ -18,6 +18,7 @@ using namespace mlir::tensor; namespace { /// Merges consecutive tensor.extract_slice ops into one. +// TODO: move to FoldTensorSubsetOps and unify APIs with FoldMemRefAliasOps. struct MergeConsecutiveExtractSlice : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -41,6 +42,7 @@ struct MergeConsecutiveExtractSlice : public OpRewritePattern { }; /// Merges consecutive tensor.insert_slice ops into one. +// TODO: move to FoldTensorSubsetOps and unify APIs with FoldMemRefAliasOps. 
template struct MergeConsecutiveInsertSlice : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; diff --git a/mlir/lib/Dialect/Tensor/Utils/CMakeLists.txt b/mlir/lib/Dialect/Tensor/Utils/CMakeLists.txt index efc78420b9e17..b7848b1a44229 100644 --- a/mlir/lib/Dialect/Tensor/Utils/CMakeLists.txt +++ b/mlir/lib/Dialect/Tensor/Utils/CMakeLists.txt @@ -7,6 +7,7 @@ add_mlir_dialect_library(MLIRTensorUtils LINK_LIBS PUBLIC MLIRAffineDialect MLIRArithDialect + MLIRArithUtils MLIRIR MLIRTensorDialect ) diff --git a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp index a5847250fa915..4c09c540bde2e 100644 --- a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp @@ -14,6 +14,7 @@ #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/Utils/IndexingUtils.h" using namespace mlir; diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index 21daff60c7e62..ce7d1844ac7f1 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -3733,6 +3733,8 @@ namespace { /// %1 = vector.transfer_read %t[%p0, %p1], %cst {in_bounds = [true, true]} /// : tensor, vector<4x5xf32> /// ``` +// TODO: this is brittle and should be deprecated in favor of a more general +// pattern that applies on-demand. struct FoldExtractSliceIntoTransferRead : public OpRewritePattern { public: @@ -3883,9 +3885,13 @@ struct TransferReadAfterWriteToBroadcast void TransferReadOp::getCanonicalizationPatterns(RewritePatternSet &results, MLIRContext *context) { - results - .add( - context); + // clang-format off + results.add < + // TODO: this is brittle and should be deprecated in favor of a + // more general pattern that applies on-demand. + FoldExtractSliceIntoTransferRead, + TransferReadAfterWriteToBroadcast>(context); + // clang-format on } //===----------------------------------------------------------------------===// @@ -4235,6 +4241,8 @@ class FoldWaw final : public OpRewritePattern { /// %1 = vector.transfer_write %v, %t2[%a, %b] {in_bounds = [true, true]} /// : vector<4x5xf32>, tensor /// ``` +// TODO: this is brittle and should be deprecated in favor of a more general +// pattern that applies on-demand. 
struct FoldInsertSliceIntoTransferWrite : public OpRewritePattern { public: @@ -4417,8 +4425,13 @@ struct SwapExtractSliceOfTransferWrite void TransferWriteOp::getCanonicalizationPatterns(RewritePatternSet &results, MLIRContext *context) { - results.add(context); + // clang-format on } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/IR/AffineMap.cpp b/mlir/lib/IR/AffineMap.cpp index 39c8ab96aa662..9ac181f46b578 100644 --- a/mlir/lib/IR/AffineMap.cpp +++ b/mlir/lib/IR/AffineMap.cpp @@ -8,6 +8,7 @@ #include "mlir/IR/AffineMap.h" #include "AffineMapDetail.h" +#include "mlir/IR/AffineExpr.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/Support/LogicalResult.h" @@ -15,8 +16,10 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/raw_ostream.h" +#include #include #include #include @@ -467,6 +470,15 @@ AffineMap::replace(const DenseMap &map) const { return AffineMap::inferFromExprList(newResults).front(); } +AffineMap AffineMap::dropResults(const llvm::SmallBitVector &positions) const { + auto exprs = llvm::to_vector<4>(getResults()); + // TODO: this is a pretty terrible API .. is there anything better? + for (auto pos = positions.find_last(); pos != -1; + pos = positions.find_prev(pos)) + exprs.erase(exprs.begin() + pos); + return AffineMap::get(getNumDims(), getNumSymbols(), exprs, getContext()); +} + AffineMap AffineMap::compose(AffineMap map) const { assert(getNumDims() == map.getNumResults() && "Number of results mismatch"); // Prepare `map` by concatenating the symbols and rewriting its exprs. @@ -808,6 +820,14 @@ llvm::SmallBitVector mlir::getUnusedSymbolsBitVector(ArrayRef maps) { return numSymbolsBitVector; } +AffineMap +mlir::expandDimsToRank(AffineMap map, int64_t rank, + const llvm::SmallBitVector &projectedDimensions) { + auto id = AffineMap::getMultiDimIdentityMap(rank, map.getContext()); + AffineMap proj = id.dropResults(projectedDimensions); + return map.compose(proj); +} + //===----------------------------------------------------------------------===// // MutableAffineMap. 
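To make the new dropResults/expandDimsToRank helpers above concrete (an illustration, not part of the diff): expandDimsToRank composes the op's reduced-rank permutation map with an identity map of the full source rank from which the dropped dimensions were removed. The sketch below models projected permutation maps as vectors of selected dimension positions; PermMap and the helper names are invented, and the real implementation manipulates AffineExpr results instead of integers.

#include <cstdint>
#include <vector>

// A projected permutation map, stored as the source dims it selects,
// e.g. {0, 2} stands for (d0, d1, d2) -> (d0, d2).
using PermMap = std::vector<int64_t>;

// Identity map of the given rank with the dropped result positions removed,
// mirroring getMultiDimIdentityMap(rank) followed by dropResults(dropped).
PermMap identityDropResults(int64_t rank, const std::vector<bool> &dropped) {
  PermMap proj;
  for (int64_t d = 0; d < rank; ++d)
    if (!dropped[d])
      proj.push_back(d);
  return proj;
}

// map.compose(proj): rewrite each result of map, which refers to a dim of the
// reduced space, in terms of the dims of the full-rank space.
PermMap compose(const PermMap &map, const PermMap &proj) {
  PermMap result;
  for (int64_t r : map)
    result.push_back(proj[r]);
  return result;
}

int main() {
  // Rank-3 source with dim 1 dropped by the slice: proj is (d0,d1,d2)->(d0,d2).
  PermMap proj = identityDropResults(3, {false, true, false});
  // A 2-D identity permutation map on the slice expands to (d0,d1,d2)->(d0,d2)
  // on the source, the #[[$d0d2]] map checked in the tests further below.
  PermMap expanded = compose({0, 1}, proj);
  return (expanded.size() == 2 && expanded[0] == 0 && expanded[1] == 2) ? 0 : 1;
}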
//===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir index bcbad20a2fd7a..a29f86eb4a263 100644 --- a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir +++ b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir @@ -6,7 +6,7 @@ func.func @fold_static_stride_subview_with_load(%arg0 : memref<12x32xf32>, %arg1 return %1 : f32 } // CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0, s1] -> (s0 + s1 * 2)> -// CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0, s1] -> (s0 + s1 * 3)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0, s1] -> (s0 + s1 * 3)> // CHECK: func @fold_static_stride_subview_with_load // CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<12x32xf32> // CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: index @@ -25,7 +25,7 @@ func.func @fold_dynamic_stride_subview_with_load(%arg0 : memref<12x32xf32>, %arg %1 = memref.load %0[%arg3, %arg4] : memref<4x4xf32, strided<[?, ?], offset: ?>> return %1 : f32 } -// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1, s2] -> (s1 + s2 * s0)> +// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1, s2] -> (s0 + s1 * s2)> // CHECK: func @fold_dynamic_stride_subview_with_load // CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<12x32xf32> // CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: index @@ -34,8 +34,8 @@ func.func @fold_dynamic_stride_subview_with_load(%arg0 : memref<12x32xf32>, %arg // CHECK-SAME: %[[ARG4:[a-zA-Z0-9_]+]]: index // CHECK-SAME: %[[ARG5:[a-zA-Z0-9_]+]]: index // CHECK-SAME: %[[ARG6:[a-zA-Z0-9_]+]]: index -// CHECK-DAG: %[[I1:.+]] = affine.apply #[[MAP]]()[%[[ARG5]], %[[ARG1]], %[[ARG3]]] -// CHECK-DAG: %[[I2:.+]] = affine.apply #[[MAP]]()[%[[ARG6]], %[[ARG2]], %[[ARG4]]] +// CHECK-DAG: %[[I1:.+]] = affine.apply #[[MAP]]()[%[[ARG1]], %[[ARG3]], %[[ARG5]]] +// CHECK-DAG: %[[I2:.+]] = affine.apply #[[MAP]]()[%[[ARG2]], %[[ARG4]], %[[ARG6]]] // CHECK: memref.load %[[ARG0]][%[[I1]], %[[I2]]] // ----- @@ -66,7 +66,7 @@ func.func @fold_dynamic_stride_subview_with_store(%arg0 : memref<12x32xf32>, %ar memref.store %arg7, %0[%arg3, %arg4] : memref<4x4xf32, strided<[?, ?], offset: ?>> return } -// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1, s2] -> (s1 + s2 * s0)> +// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1, s2] -> (s0 + s1 * s2)> // CHECK: func @fold_dynamic_stride_subview_with_store // CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<12x32xf32> // CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: index @@ -75,8 +75,8 @@ func.func @fold_dynamic_stride_subview_with_store(%arg0 : memref<12x32xf32>, %ar // CHECK-SAME: %[[ARG4:[a-zA-Z0-9_]+]]: index // CHECK-SAME: %[[ARG5:[a-zA-Z0-9_]+]]: index // CHECK-SAME: %[[ARG6:[a-zA-Z0-9_]+]]: index -// CHECK-DAG: %[[I1:.+]] = affine.apply #[[MAP]]()[%[[ARG5]], %[[ARG1]], %[[ARG3]]] -// CHECK-DAG: %[[I2:.+]] = affine.apply #[[MAP]]()[%[[ARG6]], %[[ARG2]], %[[ARG4]]] +// CHECK-DAG: %[[I1:.+]] = affine.apply #[[MAP]]()[%[[ARG1]], %[[ARG3]], %[[ARG5]]] +// CHECK-DAG: %[[I2:.+]] = affine.apply #[[MAP]]()[%[[ARG2]], %[[ARG4]], %[[ARG6]]] // CHECK: memref.store %{{.+}}, %[[ARG0]][%[[I1]], %[[I2]]] // ----- @@ -85,7 +85,7 @@ func.func @fold_subview_with_transfer_read_0d( %arg0 : memref<12x32xf32>, %arg1 : index, %arg2 : index, %arg3 : index) -> vector { %f1 = arith.constant 1.0 : f32 - %0 = memref.subview %arg0[%arg1, %arg2][1, 1][2, %arg3] : memref<12x32xf32> to memref> + %0 = memref.subview %arg0[%arg1, %arg2][1, 1][1, 1] : memref<12x32xf32> to memref> %1 = vector.transfer_read %0[], %f1 : memref>, vector return %1 : vector } @@ 
-100,22 +100,14 @@ func.func @fold_subview_with_transfer_read_0d( func.func @fold_subview_with_transfer_read(%arg0 : memref<12x32xf32>, %arg1 : index, %arg2 : index, %arg3 : index, %arg4 : index, %arg5 : index, %arg6 : index) -> vector<4xf32> { %f1 = arith.constant 1.0 : f32 + %0 = memref.subview %arg0[%arg1, %arg2][4, 4][%arg5, %arg6] : memref<12x32xf32> to memref<4x4xf32, strided<[?, ?], offset: ?>> %1 = vector.transfer_read %0[%arg3, %arg4], %f1 {in_bounds = [true]} : memref<4x4xf32, strided<[?, ?], offset: ?>>, vector<4xf32> return %1 : vector<4xf32> } -// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1, s2] -> (s1 + s2 * s0)> // CHECK: func @fold_subview_with_transfer_read -// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<12x32xf32> -// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: index -// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]+]]: index -// CHECK-SAME: %[[ARG3:[a-zA-Z0-9_]+]]: index -// CHECK-SAME: %[[ARG4:[a-zA-Z0-9_]+]]: index -// CHECK-SAME: %[[ARG5:[a-zA-Z0-9_]+]]: index -// CHECK-SAME: %[[ARG6:[a-zA-Z0-9_]+]]: index -// CHECK-DAG: %[[I1:.+]] = affine.apply #[[MAP]]()[%[[ARG5]], %[[ARG1]], %[[ARG3]]] -// CHECK-DAG: %[[I2:.+]] = affine.apply #[[MAP]]()[%[[ARG6]], %[[ARG2]], %[[ARG4]]] -// CHECK: vector.transfer_read %[[ARG0]][%[[I1]], %[[I2]]] +// Can't fold this atm since we don't emit the proper vector.extract_strided_slice. +// CHECK: memref.subview // ----- @@ -123,7 +115,7 @@ func.func @fold_static_stride_subview_with_transfer_write_0d( %arg0 : memref<12x32xf32>, %arg1 : index, %arg2 : index, %arg3 : index, %v : vector) { %f1 = arith.constant 1.0 : f32 - %0 = memref.subview %arg0[%arg1, %arg2][1, 1][2, %arg3] : memref<12x32xf32> to memref> + %0 = memref.subview %arg0[%arg1, %arg2][1, 1][1, 1] : memref<12x32xf32> to memref> vector.transfer_write %v, %0[] {in_bounds = []} : vector, memref> return } @@ -143,18 +135,9 @@ func.func @fold_static_stride_subview_with_transfer_write(%arg0 : memref<12x32xf vector.transfer_write %arg7, %0[%arg3, %arg4] {in_bounds = [true]} : vector<4xf32>, memref<4x4xf32, strided<[?, ?], offset: ?>> return } -// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1, s2] -> (s1 + s2 * s0)> // CHECK: func @fold_static_stride_subview_with_transfer_write -// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<12x32xf32> -// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: index -// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]+]]: index -// CHECK-SAME: %[[ARG3:[a-zA-Z0-9_]+]]: index -// CHECK-SAME: %[[ARG4:[a-zA-Z0-9_]+]]: index -// CHECK-SAME: %[[ARG5:[a-zA-Z0-9_]+]]: index -// CHECK-SAME: %[[ARG6:[a-zA-Z0-9_]+]]: index -// CHECK-DAG: %[[I1:.+]] = affine.apply #[[MAP]]()[%[[ARG5]], %[[ARG1]], %[[ARG3]]] -// CHECK-DAG: %[[I2:.+]] = affine.apply #[[MAP]]()[%[[ARG6]], %[[ARG2]], %[[ARG4]]] -// CHECK: vector.transfer_write %{{.+}}, %[[ARG0]][%[[I1]], %[[I2]]] +// Can't fold this atm since we don't emit the proper vector.extract_strided_slice. 
+// CHECK: memref.subview // ----- @@ -168,7 +151,7 @@ func.func @fold_rank_reducing_subview_with_load %1 = memref.load %0[%arg13, %arg14, %arg15, %arg16] : memref<4x1x4x1xf32, strided<[?, ?, ?, ?], offset: ?>> return %1 : f32 } -// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1, s2] -> (s1 + s2 * s0)> +// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1, s2] -> (s0 + s1 * s2)> // CHECK: func @fold_rank_reducing_subview_with_load // CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref // CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: index @@ -187,10 +170,10 @@ func.func @fold_rank_reducing_subview_with_load // CHECK-SAME: %[[ARG14:[a-zA-Z0-9_]+]]: index // CHECK-SAME: %[[ARG15:[a-zA-Z0-9_]+]]: index // CHECK-SAME: %[[ARG16:[a-zA-Z0-9_]+]]: index -// CHECK-DAG: %[[I0:.+]] = affine.apply #[[MAP]]()[%[[ARG7]], %[[ARG1]], %[[ARG13]]] -// CHECK-DAG: %[[I2:.+]] = affine.apply #[[MAP]]()[%[[ARG9]], %[[ARG3]], %[[ARG14]]] -// CHECK-DAG: %[[I3:.+]] = affine.apply #[[MAP]]()[%[[ARG10]], %[[ARG4]], %[[ARG15]]] -// CHECK-DAG: %[[I4:.+]] = affine.apply #[[MAP]]()[%[[ARG11]], %[[ARG5]], %[[ARG16]]] +// CHECK-DAG: %[[I0:.+]] = affine.apply #[[MAP]]()[%[[ARG1]], %[[ARG13]], %[[ARG7]]] +// CHECK-DAG: %[[I2:.+]] = affine.apply #[[MAP]]()[%[[ARG3]], %[[ARG14]], %[[ARG9]]] +// CHECK-DAG: %[[I3:.+]] = affine.apply #[[MAP]]()[%[[ARG4]], %[[ARG15]], %[[ARG10]]] +// CHECK-DAG: %[[I4:.+]] = affine.apply #[[MAP]]()[%[[ARG5]], %[[ARG16]], %[[ARG11]]] // CHECK: memref.load %[[ARG0]][%[[I0]], %[[ARG2]], %[[I2]], %[[I3]], %[[I4]], %[[ARG6]]] // ----- diff --git a/mlir/test/Dialect/Tensor/fold-tensor-subset-ops.mlir b/mlir/test/Dialect/Tensor/fold-tensor-subset-ops.mlir new file mode 100644 index 0000000000000..93a0d77bc698f --- /dev/null +++ b/mlir/test/Dialect/Tensor/fold-tensor-subset-ops.mlir @@ -0,0 +1,262 @@ +// RUN: mlir-opt -fold-tensor-subset-ops -split-input-file %s | FileCheck %s + +func.func @fold_vector_transfer_read_with_rank_reduced_extract_slice( + %arg0 : tensor, + %arg1: index, %arg2 : index, %arg3 : index, %arg4: index, %arg5 : index, + %arg6 : index) -> vector<4xf32> { + %cst = arith.constant 0.0 : f32 + %0 = tensor.extract_slice %arg0[0, %arg1, %arg2] [1, %arg3, %arg4] [1, 1, 1] + : tensor to + tensor + %1 = vector.transfer_read %0[%arg5, %arg6], %cst {in_bounds = [true]} + : tensor, vector<4xf32> + return %1 : vector<4xf32> +} +// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1] -> (s0 + s1)> +// CHECK: func @fold_vector_transfer_read_with_rank_reduced_extract_slice +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index +// CHECK-SAME: %[[ARG3:[a-zA-Z0-9]+]]: index +// CHECK-SAME: %[[ARG4:[a-zA-Z0-9]+]]: index +// CHECK-SAME: %[[ARG5:[a-zA-Z0-9]+]]: index +// CHECK-SAME: %[[ARG6:[a-zA-Z0-9]+]]: index +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[IDX0:.+]] = affine.apply #[[$MAP1]]()[%[[ARG1]], %[[ARG5]]] +// CHECK-DAG: %[[IDX1:.+]] = affine.apply #[[$MAP1]]()[%[[ARG2]], %[[ARG6]]] +// CHECK: vector.transfer_read %[[ARG0]][%[[C0]], %[[IDX0]], %[[IDX1]]], %{{.*}} : tensor, + %i1: index, %i2: index, %i3: index, %i4: index) -> vector<4xf32> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %f0 = arith.constant 0.000000e+00 : f32 + + // Can't fold this atm since we don' emit the proper vector.extract_strided_slice. 
+// CHECK: tensor.extract_slice + %0 = tensor.extract_slice %src[0, %i1, %i2, %i3] [1, 4, 1, 4] [2, 3, 4, 5] : tensor<1x8x8x8xf32> to tensor<1x4x4xf32> + %1 = vector.transfer_read %0[%c1, %i4, %c2], %f0 {in_bounds = [true]} : tensor<1x4x4xf32>, vector<4xf32> + return %1 : vector<4xf32> +} + +// ----- + +// CHECK-DAG: #[[$ADD_4:.+]] = affine_map<()[s0] -> (s0 + 4)> + +// CHECK-LABEL: func @transfer_read_of_extract_slice( +// CHECK-SAME: %[[t:.*]]: tensor, %[[s1:.*]]: index, %[[s2:.*]]: index +// CHECK-DAG: %[[c8:.*]] = arith.constant 8 : index +// CHECK: %[[add:.*]] = affine.apply #[[$ADD_4]]()[%[[s1]]] +// CHECK: %[[r:.*]] = vector.transfer_read %[[t]][%[[c8]], %[[add]]], %{{.*}} {in_bounds = [true, true]} : tensor, vector<5x6xf32> +// CHECK: return %[[r]] +func.func @transfer_read_of_extract_slice(%t : tensor, %s1 : index, %s2 : index) -> vector<5x6xf32> { + %c3 = arith.constant 3 : index + %c4 = arith.constant 4 : index + %cst = arith.constant 0.0 : f32 + %0 = tensor.extract_slice %t[5, %s1] [10, %s2] [1, 1] : tensor to tensor<10x?xf32> + %1 = vector.transfer_read %0[%c3, %c4], %cst {in_bounds = [true, true]} : tensor<10x?xf32>, vector<5x6xf32> + return %1 : vector<5x6xf32> +} +// ----- + +func.func @fold_extract_slice_with_transfer_read_0d( + %arg0 : tensor<12x32xf32>, %arg1 : index, %arg2 : index, %arg3 : index) + -> vector { + %f1 = arith.constant 1.0 : f32 + %0 = tensor.extract_slice %arg0[%arg1, %arg2][1, 1][1, 1] : tensor<12x32xf32> to tensor + %1 = vector.transfer_read %0[], %f1 : tensor, vector + return %1 : vector +} +// CHECK: func @fold_extract_slice_with_transfer_read_0d +// CHECK-SAME: %[[T:[a-zA-Z0-9_]+]]: tensor<12x32xf32> +// CHECK-SAME: %[[SZ0:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[SZ1:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[ST1:[a-zA-Z0-9_]+]]: index +// CHECK: vector.transfer_read %[[T]][%[[SZ0]], %[[SZ1]]] + +// ----- + +// CHECK-DAG: #[[$ADD_4:.+]] = affine_map<()[s0] -> (s0 + 4)> + +// CHECK-LABEL: func @transfer_read_of_extract_slice( +// CHECK-SAME: %[[t:.*]]: tensor, %[[s1:.*]]: index, %[[s2:.*]]: index +// CHECK-DAG: %[[c8:.*]] = arith.constant 8 : index +// CHECK: %[[add:.*]] = affine.apply #[[$ADD_4]]()[%[[s1]]] +// CHECK: %[[r:.*]] = vector.transfer_read %[[t]][%[[c8]], %[[add]]], %{{.*}} {in_bounds = [true]} : tensor, vector<6xf32> +// CHECK: return %[[r]] +func.func @transfer_read_of_extract_slice(%t : tensor, %s1 : index, %s2 : index) -> vector<6xf32> { + %c3 = arith.constant 3 : index + %c4 = arith.constant 4 : index + %cst = arith.constant 0.0 : f32 + %0 = tensor.extract_slice %t[5, %s1] [10, %s2] [1, 1] : tensor to tensor<10x?xf32> + %1 = vector.transfer_read %0[%c3, %c4], %cst {in_bounds = [true]} : tensor<10x?xf32>, vector<6xf32> + return %1 : vector<6xf32> +} + +// ----- + +// CHECK-DAG: #[[$ADD_3:.+]] = affine_map<()[s0] -> (s0 + 3)> + +// CHECK-LABEL: func @transfer_read_of_extract_slice_rank_reducing( +// CHECK-SAME: %[[t:.*]]: tensor, %[[s1:.*]]: index, %[[s2:.*]]: index +// CHECK-DAG: %[[c5:.*]] = arith.constant 5 : index +// CHECK-DAG: %[[c10:.*]] = arith.constant 10 : index +// CHECK: %[[add:.*]] = affine.apply #[[$ADD_3]]()[%[[s1]]] +// CHECK: %[[r:.*]] = vector.transfer_read %[[t]][%[[c5]], %[[add]], %[[c10]]], %{{.*}} {in_bounds = [true, true]} : tensor, vector<5x6xf32> +// CHECK: return %[[r]] +func.func @transfer_read_of_extract_slice_rank_reducing(%t : tensor, %s1 : index, %s2 : index) -> vector<5x6xf32> { + %c3 = arith.constant 3 : index + %c4 = arith.constant 4 : index + %cst = arith.constant 0.0 : f32 + %0 = tensor.extract_slice 
%t[5, %s1, 6] [1, %s2, 12] [1, 1, 1] : tensor to tensor + %1 = vector.transfer_read %0[%c3, %c4], %cst {in_bounds = [true, true]} : tensor, vector<5x6xf32> + return %1 : vector<5x6xf32> +} + +// ----- + +// CHECK-DAG: #[[$ADD_4:.+]] = affine_map<()[s0] -> (s0 + 4)> +// CHECK-DAG: #[[$d0d2:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> + +// CHECK-LABEL: func @transfer_read_of_extract_slice_swappy_rank_reducing( +// CHECK-SAME: %[[t:.*]]: tensor, %[[s1:.*]]: index, %[[s2:.*]]: index +func.func @transfer_read_of_extract_slice_swappy_rank_reducing(%t : tensor, %s1 : index, %s2 : index) -> vector<5x6xf32> { + %c3 = arith.constant 3 : index + %c4 = arith.constant 4 : index + %cst = arith.constant 0.0 : f32 + +// CHECK-NOT: extract_slice +// CHECK: %[[c8:.*]] = arith.constant 8 : index +// CHECK: %[[add:.*]] = affine.apply #[[$ADD_4]]()[%[[s2]]] +// CHECK: %[[r:.*]] = vector.transfer_read %[[t]][%[[c8]], %[[s1]], %[[add]]] +// CHECK-SAME: permutation_map = #[[$d0d2]] +// CHECK-SAME: tensor, vector<5x6xf32> + %0 = tensor.extract_slice %t[5, %s1, %s2] [%s2, 1, 12] [1, 1, 1] : tensor to tensor + %1 = vector.transfer_read %0[%c3, %c4], %cst {in_bounds = [true, true]} : tensor, vector<5x6xf32> + + return %1 : vector<5x6xf32> +} + +// ----- + +// CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0, s1] -> (s0 + s1)> + +// CHECK: func @fold_vector_transfer_write_with_rank_reduced_insert_slice +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: vector<4xf32> +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index +// CHECK-SAME: %[[ARG3:[a-zA-Z0-9]+]]: index +// CHECK-SAME: %[[ARG4:[a-zA-Z0-9]+]]: index +// CHECK-SAME: %[[ARG5:[a-zA-Z0-9]+]]: index +// CHECK-SAME: %[[ARG6:[a-zA-Z0-9]+]]: index +// CHECK-SAME: %[[ARG7:[a-zA-Z0-9]+]]: index +func.func @fold_vector_transfer_write_with_rank_reduced_insert_slice( + %arg0 : tensor, + %arg1 : vector<4xf32>, %arg2: index, %arg3 : index, %arg4 : index, + %arg5: index, %arg6 : index, %arg7 : index, + %st : tensor) -> tensor { + %cst = arith.constant 0.0 : f32 + +// CHECK-NOT: insert_slice +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[IDX0:.+]] = affine.apply #[[MAP1]]()[%[[ARG2]], %[[ARG6]]] +// CHECK-DAG: %[[IDX1:.+]] = affine.apply #[[MAP1]]()[%[[ARG3]], %[[ARG7]]] +// CHECK-DAG: vector.transfer_write %[[ARG1]], %[[ARG0]][%[[C0]], %[[IDX0]], %[[IDX1]]] {in_bounds = [true]} : vector<4xf32>, tensor, tensor + %1 = tensor.insert_slice %0 into %arg0[0, %arg2, %arg3] [1, %arg4, %arg5] [1, 1, 1] + : tensor into tensor + return %1 : tensor +} + +// ----- + +// CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0, s1] -> (s0 + s1)> +// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d1)> + +// CHECK: func @fold_vector_transfer_write_with_inner_rank_reduced_insert_slice +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: vector<4xf32> +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index +// CHECK-SAME: %[[ARG3:[a-zA-Z0-9]+]]: index +// CHECK-SAME: %[[ARG4:[a-zA-Z0-9]+]]: index +// CHECK-SAME: %[[ARG5:[a-zA-Z0-9]+]]: index +// CHECK-SAME: %[[ARG6:[a-zA-Z0-9]+]]: index +// CHECK-SAME: %[[ARG7:[a-zA-Z0-9]+]]: index +func.func @fold_vector_transfer_write_with_inner_rank_reduced_insert_slice( + %arg0 : tensor, + %arg1 : vector<4xf32>, %arg2: index, %arg3 : index, %arg4 : index, + %arg5: index, %arg6 : index, %arg7 : index, + %st : tensor) -> tensor { + %cst = arith.constant 0.0 : f32 + + // CHECK-NOT: insert_slice + // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index + // CHECK-DAG: %[[IDX0:.+]] = affine.apply 
#[[MAP1]]()[%[[ARG2]], %[[ARG6]]] + // CHECK-DAG: %[[IDX1:.+]] = affine.apply #[[MAP1]]()[%[[ARG3]], %[[ARG7]]] + // CHECK-DAG: vector.transfer_write %[[ARG1]], %[[ARG0]][%[[IDX0]], %[[IDX1]], %[[C0]]] + // CHECK-SAME: {in_bounds = [true], permutation_map = #[[MAP2]]} : vector<4xf32>, tensor, tensor + %1 = tensor.insert_slice %0 into %arg0[%arg2, %arg3, 0] [%arg4, %arg5, 1] [1, 1, 1] + : tensor into tensor + return %1 : tensor +} + +// ----- + +// CHECK-LABEL: func @insert_slice_of_transfer_write( +// CHECK-SAME: %[[t1:.*]]: tensor, %[[v:.*]]: vector<5x6xf32>, %[[s:.*]]: index +func.func @insert_slice_of_transfer_write(%t1 : tensor, %v : vector<5x6xf32>, %s : index, %t2 : tensor<5x6xf32>) -> tensor { + %c0 = arith.constant 0 : index + + // CHECK-NOT: insert_slice +// CHECK: %[[c3:.*]] = arith.constant 3 : index +// CHECK: %[[r:.*]] = vector.transfer_write %[[v]], %[[t1]][%[[c3]], %[[s]]] {in_bounds = [true, true]} : vector<5x6xf32>, tensor +// CHECK: return %[[r]] + %0 = vector.transfer_write %v, %t2[%c0, %c0] {in_bounds = [true, true]} : vector<5x6xf32>, tensor<5x6xf32> + %1 = tensor.insert_slice %0 into %t1[3, %s] [5, 6] [1, 1] : tensor<5x6xf32> into tensor + return %1 : tensor +} + +// ----- + +// CHECK-DAG: #[[$d0d2:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> + +// CHECK-LABEL: func @insert_slice_of_transfer_write_swappy_rank_extending( +// CHECK-SAME: %[[t1:.*]]: tensor, %[[v:.*]]: vector<5x6xf32>, %[[s:.*]]: index +func.func @insert_slice_of_transfer_write_swappy_rank_extending( + %t1 : tensor, %v : vector<5x6xf32>, + %s : index, %t2 : tensor<5x6xf32>) -> tensor { + %c0 = arith.constant 0 : index + +// CHECK-NOT: insert_slice +// CHECK-DAG: %[[c3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[c4:.*]] = arith.constant 4 : index +// CHECK: %[[r:.*]] = vector.transfer_write %[[v]], %[[t1]][%[[c4]], %[[c3]], %[[s]]] +// CHECK-SAME: {in_bounds = [true, true], permutation_map = #[[$d0d2]]} : vector<5x6xf32>, tensor +// CHECK: return %[[r]] + %0 = vector.transfer_write %v, %t2[%c0, %c0] {in_bounds = [true, true]} : vector<5x6xf32>, tensor<5x6xf32> + %1 = tensor.insert_slice %0 into %t1[4, 3, %s] [5, 1, 6] [1, 1, 1] : tensor<5x6xf32> into tensor + return %1 : tensor +} + +// ----- + +// CHECK-LABEL: func @insert_slice_of_transfer_write_rank_extending( +// CHECK-SAME: %[[t1:.*]]: tensor, %[[v:.*]]: vector<5x6xf32>, %[[s:.*]]: index +// CHECK-DAG: %[[c3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[c4:.*]] = arith.constant 4 : index +// CHECK: %[[r:.*]] = vector.transfer_write %[[v]], %[[t1]][%[[c4]], %[[c3]], %[[s]]] {in_bounds = [true, true]} : vector<5x6xf32>, tensor +// CHECK: return %[[r]] +func.func @insert_slice_of_transfer_write_rank_extending(%t1 : tensor, %v : vector<5x6xf32>, %s : index, %t2 : tensor<5x6xf32>) -> tensor { + %c0 = arith.constant 0 : index + %0 = vector.transfer_write %v, %t2[%c0, %c0] {in_bounds = [true, true]} : vector<5x6xf32>, tensor<5x6xf32> + %1 = tensor.insert_slice %0 into %t1[4, 3, %s] [1, 5, 6] [1, 1, 1] : tensor<5x6xf32> into tensor + return %1 : tensor +} diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 4071d92641839..8538c3db59dcd 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -5607,6 +5607,7 @@ cc_library( deps = [ ":AffineDialect", ":ArithDialect", + ":ArithUtils", ":DialectUtils", ":TensorDialect", "//llvm:Support", @@ -5663,6 +5664,7 @@ cc_library( ":TensorPassIncGen", ":TilingInterface", 
":Transforms", + ":VectorDialect", "//llvm:Support", ], ) From da44224d3aa806be8b1c4a29c791387e151b4f23 Mon Sep 17 00:00:00 2001 From: Alex Bradbury Date: Thu, 23 Mar 2023 11:17:40 +0000 Subject: [PATCH 099/208] [RISCV][test] Fix broken unit test after d25751779ba The patch had missed the RISCVISAInfoTest.cpp change. --- llvm/unittests/Support/RISCVISAInfoTest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/unittests/Support/RISCVISAInfoTest.cpp b/llvm/unittests/Support/RISCVISAInfoTest.cpp index 2f623a94ce2a8..0b749eb0c6815 100644 --- a/llvm/unittests/Support/RISCVISAInfoTest.cpp +++ b/llvm/unittests/Support/RISCVISAInfoTest.cpp @@ -141,7 +141,7 @@ TEST(ParseArchString, AcceptsSupportedBaseISAsAndSetsXLenAndFLen) { RISCVISAInfo &InfoRV32E = **MaybeRV32E; RISCVISAInfo::OrderedExtensionMap ExtsRV32E = InfoRV32E.getExtensions(); EXPECT_EQ(ExtsRV32E.size(), 1UL); - EXPECT_TRUE(ExtsRV32E.at("e") == (RISCVExtensionInfo{1, 9})); + EXPECT_TRUE(ExtsRV32E.at("e") == (RISCVExtensionInfo{2, 0})); EXPECT_EQ(InfoRV32E.getXLen(), 32U); EXPECT_EQ(InfoRV32E.getFLen(), 0U); From 80db8b03a9ad39594277462e302b0b33e5d8c8dd Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Thu, 23 Mar 2023 11:20:19 +0000 Subject: [PATCH 100/208] [gn build] Port 48f97e575137 --- .../utils/gn/secondary/clang/lib/Analysis/FlowSensitive/BUILD.gn | 1 + .../gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn | 1 + 2 files changed, 2 insertions(+) diff --git a/llvm/utils/gn/secondary/clang/lib/Analysis/FlowSensitive/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Analysis/FlowSensitive/BUILD.gn index 2f9db59141183..cfd54004a9ff3 100644 --- a/llvm/utils/gn/secondary/clang/lib/Analysis/FlowSensitive/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Analysis/FlowSensitive/BUILD.gn @@ -10,6 +10,7 @@ static_library("FlowSensitive") { "DataflowAnalysisContext.cpp", "DataflowEnvironment.cpp", "DebugSupport.cpp", + "Logger.cpp", "Transfer.cpp", "TypeErasedDataflowAnalysis.cpp", "Value.cpp", diff --git a/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn index a3a3966fed26b..22eb6721272ba 100644 --- a/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn @@ -23,6 +23,7 @@ unittest("ClangAnalysisFlowSensitiveTests") { "DataflowAnalysisContextTest.cpp", "DataflowEnvironmentTest.cpp", "DebugSupportTest.cpp", + "LoggerTest.cpp", "MapLatticeTest.cpp", "MatchSwitchTest.cpp", "MultiVarConstantPropagationTest.cpp", From 20725d34b29ff2881ecc1d2a5b8a796e2996c313 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Thu, 23 Mar 2023 11:20:20 +0000 Subject: [PATCH 101/208] [gn build] Port c2de8ff92753 --- .../gn/secondary/llvm/lib/ExecutionEngine/JITLink/BUILD.gn | 2 ++ .../secondary/llvm/unittests/ExecutionEngine/JITLink/BUILD.gn | 1 + 2 files changed, 3 insertions(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/JITLink/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/JITLink/BUILD.gn index de4074c3bfb64..edd0f2a3539d6 100644 --- a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/JITLink/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/JITLink/BUILD.gn @@ -25,6 +25,7 @@ static_library("JITLink") { "EHFrameSupport.cpp", "ELF.cpp", "ELFLinkGraphBuilder.cpp", + "ELF_aarch32.cpp", "ELF_aarch64.cpp", "ELF_i386.cpp", "ELF_loongarch.cpp", @@ -37,6 +38,7 @@ static_library("JITLink") { 
"MachOLinkGraphBuilder.cpp", "MachO_arm64.cpp", "MachO_x86_64.cpp", + "aarch32.cpp", "aarch64.cpp", "i386.cpp", "loongarch.cpp", diff --git a/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/JITLink/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/JITLink/BUILD.gn index d0d5225c9d6c9..d0f99ce939cfe 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/JITLink/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/JITLink/BUILD.gn @@ -14,6 +14,7 @@ unittest("JITLinkTests") { "//llvm/lib/Testing/Support", ] sources = [ + "AArch32Tests.cpp", "EHFrameSupportTests.cpp", "LinkGraphTests.cpp", ] From f44c7dec67fee8a41450c8a46d9e944f88f82eb6 Mon Sep 17 00:00:00 2001 From: luxufan Date: Thu, 23 Mar 2023 19:31:29 +0800 Subject: [PATCH 102/208] [Local] Use most generic range if K does not dominate J or K doesn't have a !noundef Since D141386 has changed the return value of !range from IUB to poison, metadata !range shouldn't be preserved even if K dominates J. If this patch was accepted, I plan to adjust metadata !nonnull as well. BTW, I found that metadata !noundef is not handled in combineMetadata, is this intentional? Reviewed By: nikic Differential Revision: https://reviews.llvm.org/D142687 --- llvm/lib/Transforms/Utils/Local.cpp | 9 +--- llvm/test/Transforms/GVN/range.ll | 50 +++++++++++++++---- .../Transforms/JumpThreading/thread-loads.ll | 5 +- llvm/test/Transforms/NewGVN/range.ll | 23 +++++---- 4 files changed, 55 insertions(+), 32 deletions(-) diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 5c1fd6e9ae4a4..31deb08d45989 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -2675,14 +2675,7 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J, intersectAccessGroups(K, J)); break; case LLVMContext::MD_range: - - // If K does move, use most generic range. Otherwise keep the range of - // K. - if (DoesKMove) - // FIXME: If K does move, we should drop the range info and nonnull. - // Currently this function is used with DoesKMove in passes - // doing hoisting/sinking and the current behavior of using the - // most generic range is correct in those cases. 
+ if (DoesKMove || !K->hasMetadata(LLVMContext::MD_noundef)) K->setMetadata(Kind, MDNode::getMostGenericRange(JMD, KMD)); break; case LLVMContext::MD_fpmath: diff --git a/llvm/test/Transforms/GVN/range.ll b/llvm/test/Transforms/GVN/range.ll index ae0801ee59da1..48605aef0fe7d 100644 --- a/llvm/test/Transforms/GVN/range.ll +++ b/llvm/test/Transforms/GVN/range.ll @@ -17,7 +17,7 @@ define i32 @test1(ptr %p) { define i32 @test2(ptr %p) { ; CHECK-LABEL: define i32 @test2 ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P]], align 4, !range [[RNG0]] +; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P]], align 4 ; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] ; CHECK-NEXT: ret i32 [[C]] ; @@ -30,7 +30,7 @@ define i32 @test2(ptr %p) { define i32 @test3(ptr %p) { ; CHECK-LABEL: define i32 @test3 ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P]], align 4, !range [[RNG0]] +; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P]], align 4, !range [[RNG1:![0-9]+]] ; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] ; CHECK-NEXT: ret i32 [[C]] ; @@ -43,7 +43,7 @@ define i32 @test3(ptr %p) { define i32 @test4(ptr %p) { ; CHECK-LABEL: define i32 @test4 ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P]], align 4, !range [[RNG0]] +; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P]], align 4, !range [[RNG2:![0-9]+]] ; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] ; CHECK-NEXT: ret i32 [[C]] ; @@ -56,7 +56,7 @@ define i32 @test4(ptr %p) { define i32 @test5(ptr %p) { ; CHECK-LABEL: define i32 @test5 ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P]], align 4, !range [[RNG1:![0-9]+]] +; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P]], align 4, !range [[RNG3:![0-9]+]] ; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] ; CHECK-NEXT: ret i32 [[C]] ; @@ -69,7 +69,7 @@ define i32 @test5(ptr %p) { define i32 @test6(ptr %p) { ; CHECK-LABEL: define i32 @test6 ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P]], align 4, !range [[RNG2:![0-9]+]] +; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P]], align 4, !range [[RNG4:![0-9]+]] ; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] ; CHECK-NEXT: ret i32 [[C]] ; @@ -82,7 +82,7 @@ define i32 @test6(ptr %p) { define i32 @test7(ptr %p) { ; CHECK-LABEL: define i32 @test7 ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P]], align 4, !range [[RNG3:![0-9]+]] +; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P]], align 4, !range [[RNG5:![0-9]+]] ; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] ; CHECK-NEXT: ret i32 [[C]] ; @@ -95,7 +95,7 @@ define i32 @test7(ptr %p) { define i32 @test8(ptr %p) { ; CHECK-LABEL: define i32 @test8 ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P]], align 4, !range [[RNG4:![0-9]+]] +; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P]], align 4 ; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] ; CHECK-NEXT: ret i32 [[C]] ; @@ -105,6 +105,31 @@ define i32 @test8(ptr %p) { ret i32 %c } +define i32 @load_noundef_load(ptr %p) { +; CHECK-LABEL: define i32 @load_noundef_load +; CHECK-SAME: (ptr [[P:%.*]]) { +; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P]], align 4, !range [[RNG0]], !noundef !6 +; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] +; CHECK-NEXT: ret i32 [[C]] +; + %a = load i32, ptr %p, !range !0, !noundef !11 + %b = load i32, ptr %p, !range !1 + %c = add i32 %a, %b + ret i32 %c +} + +define i32 @load_load_noundef(ptr %p) { +; CHECK-LABEL: define i32 @load_load_noundef +; CHECK-SAME: (ptr [[P:%.*]]) { +; CHECK-NEXT: [[A:%.*]] = 
load i32, ptr [[P]], align 4, !range [[RNG1]] +; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] +; CHECK-NEXT: ret i32 [[C]] +; + %a = load i32, ptr %p, !range !0 + %b = load i32, ptr %p, !range !1, !noundef !11 + %c = add i32 %a, %b + ret i32 %c +} !0 = !{i32 0, i32 2} !1 = !{i32 3, i32 5} @@ -117,10 +142,13 @@ define i32 @test8(ptr %p) { !8 = !{i32 5, i32 1} !9 = !{i32 1, i32 5} !10 = !{i32 5, i32 1} +!11 = !{} ;. ; CHECK: [[RNG0]] = !{i32 0, i32 2} -; CHECK: [[RNG1]] = !{i32 -5, i32 -2} -; CHECK: [[RNG2]] = !{i32 10, i32 1} -; CHECK: [[RNG3]] = !{i32 1, i32 2, i32 3, i32 4} -; CHECK: [[RNG4]] = !{i32 1, i32 5} +; CHECK: [[RNG1]] = !{i32 0, i32 2, i32 3, i32 5} +; CHECK: [[RNG2]] = !{i32 0, i32 5} +; CHECK: [[RNG3]] = !{i32 -5, i32 -2, i32 1, i32 5} +; CHECK: [[RNG4]] = !{i32 10, i32 1} +; CHECK: [[RNG5]] = !{i32 3, i32 4, i32 5, i32 2} +; CHECK: [[META6:![0-9]+]] = !{} ;. diff --git a/llvm/test/Transforms/JumpThreading/thread-loads.ll b/llvm/test/Transforms/JumpThreading/thread-loads.ll index a730be9492c80..85952e8e6db4f 100644 --- a/llvm/test/Transforms/JumpThreading/thread-loads.ll +++ b/llvm/test/Transforms/JumpThreading/thread-loads.ll @@ -322,12 +322,12 @@ bb3: define void @test8(ptr, ptr, ptr) { ; CHECK-LABEL: @test8( ; CHECK-NEXT: ret2: -; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[TMP0:%.*]], align 4, !range [[RNG4:![0-9]+]] +; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[TMP0:%.*]], align 4, !range [[RNG4:![0-9]+]], !noundef !5 ; CHECK-NEXT: store i32 [[A]], ptr [[TMP1:%.*]], align 4 ; CHECK-NEXT: [[XXX:%.*]] = tail call i32 (...) @f1() #[[ATTR0]] ; CHECK-NEXT: ret void ; - %a = load i32, ptr %0, !tbaa !0, !range !4, !alias.scope !9, !noalias !10 + %a = load i32, ptr %0, !tbaa !0, !range !4, !alias.scope !9, !noalias !10, !noundef !11 %b = load i32, ptr %0, !range !5 store i32 %a, ptr %1 %c = icmp eq i32 %b, 8 @@ -693,3 +693,4 @@ right_x: !8 = !{!8, !6} !9 = !{!7} !10 = !{!8} +!11 = !{} diff --git a/llvm/test/Transforms/NewGVN/range.ll b/llvm/test/Transforms/NewGVN/range.ll index 8803737c7bd4a..c853693897ab3 100644 --- a/llvm/test/Transforms/NewGVN/range.ll +++ b/llvm/test/Transforms/NewGVN/range.ll @@ -17,7 +17,7 @@ define i32 @test1(ptr %p) { define i32 @test2(ptr %p) { ; CHECK-LABEL: define i32 @test2 ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P]], align 4, !range [[RNG0]] +; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P]], align 4 ; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] ; CHECK-NEXT: ret i32 [[C]] ; @@ -30,7 +30,7 @@ define i32 @test2(ptr %p) { define i32 @test3(ptr %p) { ; CHECK-LABEL: define i32 @test3 ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P]], align 4, !range [[RNG0]] +; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P]], align 4, !range [[RNG1:![0-9]+]] ; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] ; CHECK-NEXT: ret i32 [[C]] ; @@ -43,7 +43,7 @@ define i32 @test3(ptr %p) { define i32 @test4(ptr %p) { ; CHECK-LABEL: define i32 @test4 ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P]], align 4, !range [[RNG0]] +; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P]], align 4, !range [[RNG2:![0-9]+]] ; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] ; CHECK-NEXT: ret i32 [[C]] ; @@ -56,7 +56,7 @@ define i32 @test4(ptr %p) { define i32 @test5(ptr %p) { ; CHECK-LABEL: define i32 @test5 ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P]], align 4, !range [[RNG1:![0-9]+]] +; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P]], align 4, !range [[RNG3:![0-9]+]] ; CHECK-NEXT: [[C:%.*]] = add 
i32 [[A]], [[A]] ; CHECK-NEXT: ret i32 [[C]] ; @@ -69,7 +69,7 @@ define i32 @test5(ptr %p) { define i32 @test6(ptr %p) { ; CHECK-LABEL: define i32 @test6 ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P]], align 4, !range [[RNG2:![0-9]+]] +; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P]], align 4, !range [[RNG4:![0-9]+]] ; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] ; CHECK-NEXT: ret i32 [[C]] ; @@ -82,7 +82,7 @@ define i32 @test6(ptr %p) { define i32 @test7(ptr %p) { ; CHECK-LABEL: define i32 @test7 ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P]], align 4, !range [[RNG3:![0-9]+]] +; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P]], align 4, !range [[RNG5:![0-9]+]] ; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] ; CHECK-NEXT: ret i32 [[C]] ; @@ -95,7 +95,7 @@ define i32 @test7(ptr %p) { define i32 @test8(ptr %p) { ; CHECK-LABEL: define i32 @test8 ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P]], align 4, !range [[RNG4:![0-9]+]] +; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P]], align 4 ; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] ; CHECK-NEXT: ret i32 [[C]] ; @@ -119,8 +119,9 @@ define i32 @test8(ptr %p) { !10 = !{i32 5, i32 1} ;. ; CHECK: [[RNG0]] = !{i32 0, i32 2} -; CHECK: [[RNG1]] = !{i32 -5, i32 -2} -; CHECK: [[RNG2]] = !{i32 10, i32 1} -; CHECK: [[RNG3]] = !{i32 1, i32 2, i32 3, i32 4} -; CHECK: [[RNG4]] = !{i32 1, i32 5} +; CHECK: [[RNG1]] = !{i32 0, i32 2, i32 3, i32 5} +; CHECK: [[RNG2]] = !{i32 0, i32 5} +; CHECK: [[RNG3]] = !{i32 -5, i32 -2, i32 1, i32 5} +; CHECK: [[RNG4]] = !{i32 10, i32 1} +; CHECK: [[RNG5]] = !{i32 3, i32 4, i32 5, i32 2} ;. From 67d828fb2c0168e1fe0f1caeba8fc7dc47b0c3ff Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Thu, 23 Mar 2023 13:18:14 +0200 Subject: [PATCH 103/208] [Object][NFC] Factor out computeHeadersSize. In preparation for COFF archives support. Reviewed By: efriedma Differential Revision: https://reviews.llvm.org/D143537 --- llvm/lib/Object/ArchiveWriter.cpp | 64 +++++++++++++++++++------------ 1 file changed, 40 insertions(+), 24 deletions(-) diff --git a/llvm/lib/Object/ArchiveWriter.cpp b/llvm/lib/Object/ArchiveWriter.cpp index 0d3aad658fe43..e2c97417d0789 100644 --- a/llvm/lib/Object/ArchiveWriter.cpp +++ b/llvm/lib/Object/ArchiveWriter.cpp @@ -356,7 +356,7 @@ static void printNBits(raw_ostream &Out, object::Archive::Kind Kind, static uint64_t computeSymbolTableSize(object::Archive::Kind Kind, uint64_t NumSyms, uint64_t OffsetSize, - StringRef StringTable, + uint64_t StringTableSize, uint32_t *Padding = nullptr) { assert((OffsetSize == 4 || OffsetSize == 8) && "Unsupported OffsetSize"); uint64_t Size = OffsetSize; // Number of entries @@ -366,7 +366,7 @@ static uint64_t computeSymbolTableSize(object::Archive::Kind Kind, Size += NumSyms * OffsetSize; // Table if (isBSDLike(Kind)) Size += OffsetSize; // byte count - Size += StringTable.size(); + Size += StringTableSize; // ld64 expects the members to be 8-byte aligned for 64-bit content and at // least 4-byte aligned for 32-bit content. Opt for the larger encoding // uniformly. @@ -398,9 +398,24 @@ static void writeSymbolTableHeader(raw_ostream &Out, object::Archive::Kind Kind, } } +static uint64_t computeHeadersSize(object::Archive::Kind Kind, uint64_t NumSyms, + uint64_t SymNamesSize) { + uint32_t OffsetSize = is64BitKind(Kind) ? 
8 : 4; + uint64_t SymtabSize = + computeSymbolTableSize(Kind, NumSyms, OffsetSize, SymNamesSize); + auto computeSymbolTableHeaderSize = [=] { + SmallString<0> TmpBuf; + raw_svector_ostream Tmp(TmpBuf); + writeSymbolTableHeader(Tmp, Kind, true, SymtabSize); + return TmpBuf.size(); + }; + + return strlen("!\n") + computeSymbolTableHeaderSize() + SymtabSize; +} + static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind, bool Deterministic, ArrayRef Members, - StringRef StringTable, + StringRef StringTable, uint64_t MembersOffset, uint64_t PrevMemberOffset = 0) { // We don't write a symbol table on an archive with no members -- except on // Darwin, where the linker will abort unless the archive has a symbol table. @@ -413,17 +428,16 @@ static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind, uint64_t OffsetSize = is64BitKind(Kind) ? 8 : 4; uint32_t Pad; - uint64_t Size = computeSymbolTableSize(Kind, NumSyms, OffsetSize, StringTable, &Pad); + uint64_t Size = computeSymbolTableSize(Kind, NumSyms, OffsetSize, + StringTable.size(), &Pad); writeSymbolTableHeader(Out, Kind, Deterministic, Size, PrevMemberOffset); - uint64_t Pos = isAIXBigArchive(Kind) ? sizeof(object::BigArchive::FixLenHdr) - : Out.tell() + Size; - if (isBSDLike(Kind)) printNBits(Out, Kind, NumSyms * 2 * OffsetSize); else printNBits(Out, Kind, NumSyms); + uint64_t Pos = MembersOffset; for (const MemberData &M : Members) { for (unsigned StringOffset : M.Symbols) { if (isBSDLike(Kind)) @@ -679,9 +693,8 @@ static Error writeArchiveToStream(raw_ostream &Out, Data.insert(Data.begin(), computeStringTable(StringTableBuf)); // We would like to detect if we need to switch to a 64-bit symbol table. - uint64_t LastMemberEndOffset = - isAIXBigArchive(Kind) ? sizeof(object::BigArchive::FixLenHdr) : 8; - uint64_t LastMemberHeaderOffset = LastMemberEndOffset; + uint64_t LastMemberEndOffset = 0; + uint64_t LastMemberHeaderOffset = 0; uint64_t NumSyms = 0; for (const auto &M : Data) { // Record the start of the member's offset @@ -691,19 +704,13 @@ static Error writeArchiveToStream(raw_ostream &Out, NumSyms += M.Symbols.size(); } + std::optional HeadersSize; + // The symbol table is put at the end of the big archive file. The symbol // table is at the start of the archive file for other archive formats. - if (WriteSymtab && !isAIXBigArchive(Kind)) { + if (WriteSymtab && !is64BitKind(Kind)) { // We assume 32-bit offsets to see if 32-bit symbols are possible or not. - uint64_t SymtabSize = computeSymbolTableSize(Kind, NumSyms, 4, SymNamesBuf); - auto computeSymbolTableHeaderSize = - [=] { - SmallString<0> TmpBuf; - raw_svector_ostream Tmp(TmpBuf); - writeSymbolTableHeader(Tmp, Kind, Deterministic, SymtabSize); - return TmpBuf.size(); - }; - LastMemberHeaderOffset += computeSymbolTableHeaderSize() + SymtabSize; + HeadersSize = computeHeadersSize(Kind, NumSyms, SymNamesBuf.size()); // The SYM64 format is used when an archive's member offsets are larger than // 32-bits can hold. The need for this shift in format is detected by @@ -720,11 +727,12 @@ static Error writeArchiveToStream(raw_ostream &Out, // If LastMemberHeaderOffset isn't going to fit in a 32-bit varible we need // to switch to 64-bit. Note that the file can be larger than 4GB as long as // the last member starts before the 4GB offset. 
- if (LastMemberHeaderOffset >= Sym64Threshold) { + if (*HeadersSize + LastMemberHeaderOffset >= Sym64Threshold) { if (Kind == object::Archive::K_DARWIN) Kind = object::Archive::K_DARWIN64; else Kind = object::Archive::K_GNU64; + HeadersSize.reset(); } } @@ -736,11 +744,19 @@ static Error writeArchiveToStream(raw_ostream &Out, Out << "!\n"; if (!isAIXBigArchive(Kind)) { - if (WriteSymtab) - writeSymbolTable(Out, Kind, Deterministic, Data, SymNamesBuf); + if (WriteSymtab) { + if (!HeadersSize) + HeadersSize = computeHeadersSize(Kind, NumSyms, SymNamesBuf.size()); + writeSymbolTable(Out, Kind, Deterministic, Data, SymNamesBuf, + *HeadersSize); + } for (const MemberData &M : Data) Out << M.Header << M.Data << M.Padding; } else { + HeadersSize = sizeof(object::BigArchive::FixLenHdr); + LastMemberEndOffset += *HeadersSize; + LastMemberHeaderOffset += *HeadersSize; + // For the big archive (AIX) format, compute a table of member names and // offsets, used in the member table. uint64_t MemberTableNameStrTblSize = 0; @@ -813,7 +829,7 @@ static Error writeArchiveToStream(raw_ostream &Out, if (WriteSymtab && NumSyms > 0) writeSymbolTable(Out, Kind, Deterministic, Data, SymNamesBuf, - LastMemberEndOffset); + *HeadersSize, LastMemberEndOffset); } } Out.flush(); From 257dc54be2d292acec2d6b609ebb0a4e424c9e30 Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Thu, 23 Mar 2023 13:19:53 +0200 Subject: [PATCH 104/208] [Object][NFC] Don't insert string table into object members vector. Having string table in members vector does not fit later patches in this series. Symbol map needs to refer to objects' offsets, but string table should not be referenced. Also for ARM64EC, the new table is inserted after string table. Reviewed By: efriedma Differential Revision: https://reviews.llvm.org/D143538 --- llvm/lib/Object/ArchiveWriter.cpp | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Object/ArchiveWriter.cpp b/llvm/lib/Object/ArchiveWriter.cpp index e2c97417d0789..cd0429a271227 100644 --- a/llvm/lib/Object/ArchiveWriter.cpp +++ b/llvm/lib/Object/ArchiveWriter.cpp @@ -398,7 +398,8 @@ static void writeSymbolTableHeader(raw_ostream &Out, object::Archive::Kind Kind, } } -static uint64_t computeHeadersSize(object::Archive::Kind Kind, uint64_t NumSyms, +static uint64_t computeHeadersSize(object::Archive::Kind Kind, + uint64_t StringMemberSize, uint64_t NumSyms, uint64_t SymNamesSize) { uint32_t OffsetSize = is64BitKind(Kind) ? 8 : 4; uint64_t SymtabSize = @@ -410,7 +411,7 @@ static uint64_t computeHeadersSize(object::Archive::Kind Kind, uint64_t NumSyms, return TmpBuf.size(); }; - return strlen("!\n") + computeSymbolTableHeaderSize() + SymtabSize; + return strlen("!\n") + computeSymbolTableHeaderSize() + SymtabSize + StringMemberSize; } static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind, @@ -689,8 +690,14 @@ static Error writeArchiveToStream(raw_ostream &Out, return E; std::vector &Data = *DataOrErr; - if (!StringTableBuf.empty() && !isAIXBigArchive(Kind)) - Data.insert(Data.begin(), computeStringTable(StringTableBuf)); + uint64_t StringTableSize = 0; + MemberData StringTableMember; + if (!StringTableBuf.empty() && !isAIXBigArchive(Kind)) { + StringTableMember = computeStringTable(StringTableBuf); + StringTableSize = StringTableMember.Header.size() + + StringTableMember.Data.size() + + StringTableMember.Padding.size(); + } // We would like to detect if we need to switch to a 64-bit symbol table. 
   uint64_t LastMemberEndOffset = 0;
@@ -710,7 +717,8 @@ static Error writeArchiveToStream(raw_ostream &Out,
   // table is at the start of the archive file for other archive formats.
   if (WriteSymtab && !is64BitKind(Kind)) {
     // We assume 32-bit offsets to see if 32-bit symbols are possible or not.
-    HeadersSize = computeHeadersSize(Kind, NumSyms, SymNamesBuf.size());
+    HeadersSize =
+        computeHeadersSize(Kind, StringTableSize, NumSyms, SymNamesBuf.size());
 
     // The SYM64 format is used when an archive's member offsets are larger than
     // 32-bits can hold. The need for this shift in format is detected by
@@ -746,10 +754,16 @@ static Error writeArchiveToStream(raw_ostream &Out,
   if (!isAIXBigArchive(Kind)) {
     if (WriteSymtab) {
       if (!HeadersSize)
-        HeadersSize = computeHeadersSize(Kind, NumSyms, SymNamesBuf.size());
+        HeadersSize = computeHeadersSize(Kind, StringTableSize, NumSyms,
+                                         SymNamesBuf.size());
       writeSymbolTable(Out, Kind, Deterministic, Data, SymNamesBuf,
                        *HeadersSize);
     }
+
+    if (StringTableSize)
+      Out << StringTableMember.Header << StringTableMember.Data
+          << StringTableMember.Padding;
+
     for (const MemberData &M : Data)
       Out << M.Header << M.Data << M.Padding;
   } else {

From 4fcbf3842007569880fa916831efefda6b1bd032 Mon Sep 17 00:00:00 2001
From: Jacek Caban
Date: Thu, 23 Mar 2023 13:20:15 +0200
Subject: [PATCH 105/208] [llvm-lib] Use COFF archive format in llvm-lib
 (other archive tools don't use this format).

We currently just use GNU format for llvm-lib. This mostly works, but
ARM64EC needs an additional section that does not really fit GNU format.
This patch implements writing in COFF format (as in, it's what the archive
reader considers as K_COFF). This mostly requires emitting a symbol map.
Note that, just like in case of MSVC, symbols are de-duplicated in both
the usual symbol table and the new symbol map.

Reviewed By: efriedma Differential Revision: https://reviews.llvm.org/D143540 --- llvm/lib/Object/ArchiveWriter.cpp | 135 ++++++++++++++++---- llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp | 6 +- llvm/test/tools/llvm-lib/duplicate.test | 9 ++ 3 files changed, 126 insertions(+), 24 deletions(-) diff --git a/llvm/lib/Object/ArchiveWriter.cpp b/llvm/lib/Object/ArchiveWriter.cpp index cd0429a271227..2d0f92e43a344 100644 --- a/llvm/lib/Object/ArchiveWriter.cpp +++ b/llvm/lib/Object/ArchiveWriter.cpp @@ -17,6 +17,7 @@ #include "llvm/BinaryFormat/Magic.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Object/Archive.h" +#include "llvm/Object/COFF.h" #include "llvm/Object/Error.h" #include "llvm/Object/IRObjectFile.h" #include "llvm/Object/MachO.h" @@ -43,6 +44,10 @@ using namespace llvm; +struct SymMap { + std::map Map; +}; + NewArchiveMember::NewArchiveMember(MemoryBufferRef BufRef) : Buf(MemoryBuffer::getMemBuffer(BufRef, false)), MemberName(BufRef.getBufferIdentifier()) {} @@ -169,18 +174,21 @@ static bool isAIXBigArchive(object::Archive::Kind Kind) { return Kind == object::Archive::K_AIXBIG; } +static bool isCOFFArchive(object::Archive::Kind Kind) { + return Kind == object::Archive::K_COFF; +} + static bool isBSDLike(object::Archive::Kind Kind) { switch (Kind) { case object::Archive::K_GNU: case object::Archive::K_GNU64: case object::Archive::K_AIXBIG: + case object::Archive::K_COFF: return false; case object::Archive::K_BSD: case object::Archive::K_DARWIN: case object::Archive::K_DARWIN64: return true; - case object::Archive::K_COFF: - break; } llvm_unreachable("not supported for writting"); } @@ -191,6 +199,10 @@ static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val) { isBSDLike(Kind) ? support::little : support::big); } +template static void printLE(raw_ostream &Out, T Val) { + support::endian::write(Out, Val, support::little); +} + static void printRestOfMemberHeader( raw_ostream &Out, const sys::TimePoint &ModTime, unsigned UID, unsigned GID, unsigned Perms, uint64_t Size) { @@ -295,7 +307,11 @@ printMemberHeader(raw_ostream &Out, uint64_t Pos, raw_ostream &StringTable, auto Insertion = MemberNames.insert({M.MemberName, uint64_t(0)}); if (Insertion.second) { Insertion.first->second = StringTable.tell(); - StringTable << M.MemberName << "/\n"; + StringTable << M.MemberName; + if (isCOFFArchive(Kind)) + StringTable << '\0'; + else + StringTable << "/\n"; } NamePos = Insertion.first->second; } @@ -376,6 +392,22 @@ static uint64_t computeSymbolTableSize(object::Archive::Kind Kind, uint32_t Pad = isAIXBigArchive(Kind) ? 0 : offsetToAlignment(Size, Align(isBSDLike(Kind) ? 8 : 2)); + + Size += Pad; + if (Padding) + *Padding = Pad; + return Size; +} + +static uint64_t computeSymbolMapSize(uint64_t NumObj, SymMap &SymMap, + uint32_t *Padding = nullptr) { + uint64_t Size = sizeof(uint32_t) * 2; // Number of symbols and objects entries + Size += NumObj * sizeof(uint32_t); // Offset table + + for (auto S : SymMap.Map) + Size += sizeof(uint16_t) + S.first.length() + 1; + + uint32_t Pad = offsetToAlignment(Size, Align(2)); Size += Pad; if (Padding) *Padding = Pad; @@ -399,8 +431,9 @@ static void writeSymbolTableHeader(raw_ostream &Out, object::Archive::Kind Kind, } static uint64_t computeHeadersSize(object::Archive::Kind Kind, + uint64_t NumMembers, uint64_t StringMemberSize, uint64_t NumSyms, - uint64_t SymNamesSize) { + uint64_t SymNamesSize, SymMap *SymMap) { uint32_t OffsetSize = is64BitKind(Kind) ? 
8 : 4; uint64_t SymtabSize = computeSymbolTableSize(Kind, NumSyms, OffsetSize, SymNamesSize); @@ -410,8 +443,13 @@ static uint64_t computeHeadersSize(object::Archive::Kind Kind, writeSymbolTableHeader(Tmp, Kind, true, SymtabSize); return TmpBuf.size(); }; + uint32_t HeaderSize = computeSymbolTableHeaderSize(); + uint64_t Size = strlen("!\n") + HeaderSize + SymtabSize; + + if (SymMap) + Size += HeaderSize + computeSymbolMapSize(NumMembers, *SymMap); - return strlen("!\n") + computeSymbolTableHeaderSize() + SymtabSize + StringMemberSize; + return Size + StringMemberSize; } static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind, @@ -420,7 +458,7 @@ static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind, uint64_t PrevMemberOffset = 0) { // We don't write a symbol table on an archive with no members -- except on // Darwin, where the linker will abort unless the archive has a symbol table. - if (StringTable.empty() && !isDarwin(Kind)) + if (StringTable.empty() && !isDarwin(Kind) && !isCOFFArchive(Kind)) return; unsigned NumSyms = 0; @@ -457,8 +495,35 @@ static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind, Out.write(uint8_t(0)); } +static void writeSymbolMap(raw_ostream &Out, object::Archive::Kind Kind, + bool Deterministic, ArrayRef Members, + SymMap &SymMap, uint64_t MembersOffset) { + uint32_t Pad; + uint64_t Size = computeSymbolMapSize(Members.size(), SymMap, &Pad); + writeSymbolTableHeader(Out, Kind, Deterministic, Size, 0); + + uint32_t Pos = MembersOffset; + + printLE(Out, Members.size()); + for (const MemberData &M : Members) { + printLE(Out, Pos); // member offset + Pos += M.Header.size() + M.Data.size() + M.Padding.size(); + } + + printLE(Out, SymMap.Map.size()); + + for (auto S : SymMap.Map) + printLE(Out, S.second); + for (auto S : SymMap.Map) + Out << S.first << '\0'; + + while (Pad--) + Out.write(uint8_t(0)); +} + static Expected> -getSymbols(MemoryBufferRef Buf, raw_ostream &SymNames, bool &HasObject) { +getSymbols(MemoryBufferRef Buf, uint16_t Index, raw_ostream &SymNames, + SymMap *SymMap, bool &HasObject) { std::vector Ret; // In the scenario when LLVMContext is populated SymbolicFile will contain a @@ -487,10 +552,22 @@ getSymbols(MemoryBufferRef Buf, raw_ostream &SymNames, bool &HasObject) { for (const object::BasicSymbolRef &S : Obj->symbols()) { if (!isArchiveSymbol(S)) continue; - Ret.push_back(SymNames.tell()); - if (Error E = S.printName(SymNames)) - return std::move(E); - SymNames << '\0'; + if (SymMap) { + std::string Name; + raw_string_ostream NameStream(Name); + if (Error E = S.printName(NameStream)) + return std::move(E); + if (SymMap->Map.find(Name) != SymMap->Map.end()) + continue; // ignore duplicated symbol + SymMap->Map[Name] = Index; + Ret.push_back(SymNames.tell()); + SymNames << Name << '\0'; + } else { + Ret.push_back(SymNames.tell()); + if (Error E = S.printName(SymNames)) + return std::move(E); + SymNames << '\0'; + } } return Ret; } @@ -498,7 +575,8 @@ getSymbols(MemoryBufferRef Buf, raw_ostream &SymNames, bool &HasObject) { static Expected> computeMemberData(raw_ostream &StringTable, raw_ostream &SymNames, object::Archive::Kind Kind, bool Thin, bool Deterministic, - bool NeedSymbols, ArrayRef NewMembers) { + bool NeedSymbols, SymMap *SymMap, + ArrayRef NewMembers) { static char PaddingData[8] = {'\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'}; uint64_t Pos = @@ -564,7 +642,7 @@ computeMemberData(raw_ostream &StringTable, raw_ostream &SymNames, // The big archive format needs to know the offset 
of the previous member // header. - unsigned PrevOffset = 0; + unsigned PrevOffset = 0, Index = 0; for (const NewArchiveMember &M : NewMembers) { std::string Header; raw_string_ostream Out(Header); @@ -572,6 +650,8 @@ computeMemberData(raw_ostream &StringTable, raw_ostream &SymNames, MemoryBufferRef Buf = M.Buf->getMemBufferRef(); StringRef Data = Thin ? "" : Buf.getBuffer(); + Index++; + // ld64 expects the members to be 8-byte aligned for 64-bit content and at // least 4-byte aligned for 32-bit content. Opt for the larger encoding // uniformly. This matches the behaviour with cctools and ensures that ld64 @@ -612,7 +692,7 @@ computeMemberData(raw_ostream &StringTable, raw_ostream &SymNames, std::vector Symbols; if (NeedSymbols) { Expected> SymbolsOrErr = - getSymbols(Buf, SymNames, HasObject); + getSymbols(Buf, Index, SymNames, SymMap, HasObject); if (!SymbolsOrErr) return createFileError(M.MemberName, SymbolsOrErr.takeError()); Symbols = std::move(*SymbolsOrErr); @@ -624,7 +704,7 @@ computeMemberData(raw_ostream &StringTable, raw_ostream &SymNames, // If there are no symbols, emit an empty symbol table, to satisfy Solaris // tools, older versions of which expect a symbol table in a non-empty // archive, regardless of whether there are any symbols in it. - if (HasObject && SymNames.tell() == 0) + if (HasObject && SymNames.tell() == 0 && !isCOFFArchive(Kind)) SymNames << '\0' << '\0' << '\0'; return Ret; } @@ -682,10 +762,16 @@ static Error writeArchiveToStream(raw_ostream &Out, raw_svector_ostream SymNames(SymNamesBuf); SmallString<0> StringTableBuf; raw_svector_ostream StringTable(StringTableBuf); + SymMap SymMap; - Expected> DataOrErr = - computeMemberData(StringTable, SymNames, Kind, Thin, Deterministic, - WriteSymtab, NewMembers); + // COFF symbol map uses 16-bit indexes, so we can't use it if there are too + // many members. + if (isCOFFArchive(Kind) && NewMembers.size() > 0xfffe) + Kind = object::Archive::K_GNU; + + Expected> DataOrErr = computeMemberData( + StringTable, SymNames, Kind, Thin, Deterministic, WriteSymtab, + isCOFFArchive(Kind) ? &SymMap : nullptr, NewMembers); if (Error E = DataOrErr.takeError()) return E; std::vector &Data = *DataOrErr; @@ -717,8 +803,9 @@ static Error writeArchiveToStream(raw_ostream &Out, // table is at the start of the archive file for other archive formats. if (WriteSymtab && !is64BitKind(Kind)) { // We assume 32-bit offsets to see if 32-bit symbols are possible or not. - HeadersSize = - computeHeadersSize(Kind, StringTableSize, NumSyms, SymNamesBuf.size()); + HeadersSize = computeHeadersSize(Kind, Data.size(), StringTableSize, + NumSyms, SymNamesBuf.size(), + isCOFFArchive(Kind) ? &SymMap : nullptr); // The SYM64 format is used when an archive's member offsets are larger than // 32-bits can hold. The need for this shift in format is detected by @@ -754,10 +841,14 @@ static Error writeArchiveToStream(raw_ostream &Out, if (!isAIXBigArchive(Kind)) { if (WriteSymtab) { if (!HeadersSize) - HeadersSize = computeHeadersSize(Kind, StringTableSize, NumSyms, - SymNamesBuf.size()); + HeadersSize = computeHeadersSize( + Kind, Data.size(), StringTableSize, NumSyms, SymNamesBuf.size(), + isCOFFArchive(Kind) ? 
&SymMap : nullptr); writeSymbolTable(Out, Kind, Deterministic, Data, SymNamesBuf, *HeadersSize); + + if (isCOFFArchive(Kind)) + writeSymbolMap(Out, Kind, Deterministic, Data, SymMap, *HeadersSize); } if (StringTableSize) diff --git a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp index 3a609eefcb10e..9ca63bead9bc7 100644 --- a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp +++ b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp @@ -455,10 +455,12 @@ int llvm::libDriverMain(ArrayRef ArgsArr) { // For compatibility with MSVC, reverse member vector after de-duplication. std::reverse(Members.begin(), Members.end()); + bool Thin = Args.hasArg(OPT_llvmlibthin); if (Error E = writeArchive(OutputPath, Members, - /*WriteSymtab=*/true, object::Archive::K_GNU, - /*Deterministic*/ true, Args.hasArg(OPT_llvmlibthin))) { + /*WriteSymtab=*/true, + Thin ? object::Archive::K_GNU : object::Archive::K_COFF, + /*Deterministic*/ true, Thin)) { handleAllErrors(std::move(E), [&](const ErrorInfoBase &EI) { llvm::errs() << OutputPath << ": " << EI.message() << "\n"; }); diff --git a/llvm/test/tools/llvm-lib/duplicate.test b/llvm/test/tools/llvm-lib/duplicate.test index 098858d4fbcd1..87dae66cb80be 100644 --- a/llvm/test/tools/llvm-lib/duplicate.test +++ b/llvm/test/tools/llvm-lib/duplicate.test @@ -14,3 +14,12 @@ CHECK: bar.o CHECK-NEXT: abc.o CHECK-NEXT: foo.o CHECK-NOT: foo.o + +# Check that symbol map contains sorted, de-duplicated symbols. +RUN: cd %t && llvm-lib -out:foo.lib foo.o foo.o abc.o bar.o foo.o foo.o +RUN: llvm-nm --print-armap %t/foo.lib | FileCheck %s --check-prefix=DUP +# DUP: Archive map +# DUP-NEXT: a in abc.o +# DUP-NEXT: b in bar.o +# DUP-NEXT: c in abc.o +# DUP-EMPTY From a5988034a44d039f95db3067e4ad0dfeeca155c3 Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Thu, 23 Mar 2023 13:20:37 +0200 Subject: [PATCH 106/208] [lld] Fill .text section gaps with INT3 only on x86 targets. It doesn't make sense on ARM and using default 0 fill is compatible with MSVC. (It's more noticeable ARM64EC targets, where additional padding mixed with alignment is used for entry thunk association, so there are more gaps). Reviewed By: mstorsjo Differential Revision: https://reviews.llvm.org/D145962 --- lld/COFF/Writer.cpp | 3 +- lld/test/COFF/arm-thumb-thunks-multipass.s | 2 +- lld/test/COFF/arm64-import2.test | 2 +- lld/test/COFF/gaps-fill.test | 78 ++++++++++++++++++++++ 4 files changed, 82 insertions(+), 3 deletions(-) create mode 100644 lld/test/COFF/gaps-fill.test diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index 0909b14d81901..603703e65290b 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -1953,7 +1953,8 @@ void Writer::writeSections() { // Fill gaps between functions in .text with INT3 instructions // instead of leaving as NUL bytes (which can be interpreted as // ADD instructions). 
- if (sec->header.Characteristics & IMAGE_SCN_CNT_CODE) + if ((sec->header.Characteristics & IMAGE_SCN_CNT_CODE) && + (ctx.config.machine == AMD64 || ctx.config.machine == I386)) memset(secBuf, 0xCC, sec->getRawSize()); parallelForEach(sec->chunks, [&](Chunk *c) { c->writeTo(secBuf + c->getRVA() - sec->getRVA()); diff --git a/lld/test/COFF/arm-thumb-thunks-multipass.s b/lld/test/COFF/arm-thumb-thunks-multipass.s index 71ce53d99b31f..c10b22963187b 100644 --- a/lld/test/COFF/arm-thumb-thunks-multipass.s +++ b/lld/test/COFF/arm-thumb-thunks-multipass.s @@ -67,4 +67,4 @@ far_func\i: // FUNC01-THUNKS: 40500a: f2c0 0c10 movt r12, #16 // FUNC01-THUNKS: 40500e: 44e7 add pc, r12 // The instruction below is padding from the .balign -// FUNC01-THUNKS: 405010: cccc ldm r4!, {r2, r3, r6, r7} +// FUNC01-THUNKS: 405010: 0000 movs r0, r0 diff --git a/lld/test/COFF/arm64-import2.test b/lld/test/COFF/arm64-import2.test index 9b95f1a29b834..342671211db87 100644 --- a/lld/test/COFF/arm64-import2.test +++ b/lld/test/COFF/arm64-import2.test @@ -18,7 +18,7 @@ # AFTER: 140001000: 94000004 bl 0x140001010 # AFTER: 140001004: 94000006 bl 0x14000101c # AFTER: 140001008: d65f03c0 ret -# AFTER: 14000100c: ccccccff +# AFTER: 14000100c: 000000ff # AFTER: 140001010: b0000010 adrp x16, 0x140002000 # AFTER: 140001014: f9403210 ldr x16, [x16, #96] # AFTER: 140001018: d61f0200 br x16 diff --git a/lld/test/COFF/gaps-fill.test b/lld/test/COFF/gaps-fill.test new file mode 100644 index 0000000000000..17cd9cbc86ab3 --- /dev/null +++ b/lld/test/COFF/gaps-fill.test @@ -0,0 +1,78 @@ +# REQUIRES: aarch64 +# RUN: split-file %s %t.dir + +# RUN: llvm-mc -filetype=obj -triple=aarch64-windows %t.dir/arm64-dllmain.s -o %t.dir/arm64-dllmain.obj +# RUN: llvm-mc -filetype=obj -triple=aarch64-windows %t.dir/arm64-p4sym.s -o %t.dir/arm64-p4sym.obj +# RUN: lld-link -dll -machine:arm64 %t.dir/arm64-dllmain.obj %t.dir/arm64-p4sym.obj -out:%t.dll + +# RUN: llvm-objdump -dz %t.dll | FileCheck -check-prefix=CHECK-ARM64 %s +# CHECK-ARM64: 180001000: 52800020 mov w0, #0x1 +# CHECK-ARM64: 180001004: d65f03c0 ret +# CHECK-ARM64: 180001008: 00000000 +# CHECK-ARM64: 18000100c: 00000000 +# CHECK-ARM64: 180001010: 52800040 mov w0, #0x2 +# CHECK-ARM64: 180001014: d65f03c0 ret + +#--- arm64-dllmain.s + .def _DllMainCRTStartup; + .scl 2; + .type 32; + .endef + .globl _DllMainCRTStartup + .p2align 2 +_DllMainCRTStartup: + mov w0, #1 + ret + +#--- arm64-p4sym.s + .def p4sym; + .scl 2; + .type 32; + .endef + .globl p4sym + .p2align 4 +p4sym: + mov w0, #2 + ret + +# RUN: llvm-mc -filetype=obj -triple=x86_64-windows %t.dir/x86_64-dllmain.s -o %t.dir/x86_64-dllmain.obj +# RUN: llvm-mc -filetype=obj -triple=x86_64-windows %t.dir/x86_64-p4sym.s -o %t.dir/x86_64-p4sym.obj +# RUN: lld-link -dll -machine:amd64 %t.dir/x86_64-dllmain.obj %t.dir/x86_64-p4sym.obj -out:%t.dll + +# RUN: llvm-objdump -dz %t.dll | FileCheck -check-prefix=CHECK-X64 %s +# CHECK-X64: 180001000: b8 01 00 00 00 movl $0x1, %eax +# CHECK-X64: 180001005: c3 retq +# CHECK-X64: 180001006: cc int3 +# CHECK-X64: 180001007: cc int3 +# CHECK-X64: 180001008: cc int3 +# CHECK-X64: 180001009: cc int3 +# CHECK-X64: 18000100a: cc int3 +# CHECK-X64: 18000100b: cc int3 +# CHECK-X64: 18000100c: cc int3 +# CHECK-X64: 18000100d: cc int3 +# CHECK-X64: 18000100e: cc int3 +# CHECK-X64: 18000100f: cc int3 +# CHECK-X64: 180001010: b8 02 00 00 00 movl $0x2, %eax +# CHECK-X64: 180001015: c3 retq + +#--- x86_64-dllmain.s + .def _DllMainCRTStartup; + .scl 2; + .type 32; + .endef + .globl _DllMainCRTStartup + .p2align 4, 0x90 
+_DllMainCRTStartup: + movl $1, %eax + retq + +#--- x86_64-p4sym.s + .def p4sym; + .scl 2; + .type 32; + .endef + .globl p4sym + .p2align 4, 0x90 +p4sym: + movl $2, %eax + retq From 0cbfd68af79fa4262e2f5f8939f94fccd439cb0b Mon Sep 17 00:00:00 2001 From: Quentin Colombet Date: Thu, 23 Mar 2023 12:43:34 +0100 Subject: [PATCH 107/208] =?UTF-8?q?[mlir]=20Fix=20call=20of=20overloaded?= =?UTF-8?q?=20=E2=80=98dropResults()?= =?UTF-8?q?=E2=80=99=20is=20ambiguous?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit NFC --- mlir/include/mlir/IR/AffineMap.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mlir/include/mlir/IR/AffineMap.h b/mlir/include/mlir/IR/AffineMap.h index 75a268c483955..e21dc9c950c5a 100644 --- a/mlir/include/mlir/IR/AffineMap.h +++ b/mlir/include/mlir/IR/AffineMap.h @@ -249,7 +249,9 @@ class AffineMap { /// Returns a new AffineMap with the same number of dims and symbols and one /// less result at `pos`, dropped. - AffineMap dropResult(int64_t pos) const { return dropResults({pos}); } + AffineMap dropResult(int64_t pos) const { + return dropResults(ArrayRef({pos})); + } // Returns a new AffineMap with the same number of dims and symbols, but all // results in `positions` dropped. From c2c9de4ae1251a6a9ee7eed6403bbb41a386bbcb Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Thu, 23 Mar 2023 12:56:33 +0100 Subject: [PATCH 108/208] [gn] port a282ea4898efe --- llvm/utils/gn/secondary/clang/test/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang/test/BUILD.gn b/llvm/utils/gn/secondary/clang/test/BUILD.gn index 480e1cd5a89c4..c88db82c10192 100644 --- a/llvm/utils/gn/secondary/clang/test/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/test/BUILD.gn @@ -176,6 +176,7 @@ group("test") { "//llvm/tools/llvm-nm:symlinks", "//llvm/tools/llvm-objcopy:symlinks", "//llvm/tools/llvm-objdump:symlinks", + "//llvm/tools/llvm-pdbutil", "//llvm/tools/llvm-profdata", "//llvm/tools/llvm-rc:symlinks", "//llvm/tools/llvm-readobj:symlinks", From 18d56880a89ad7d58f8543d148facebd079cef19 Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Thu, 23 Mar 2023 08:05:15 -0400 Subject: [PATCH 109/208] Revert "libclang: Pass Clang install directory to driver via argv[0]." This reverts commit 201fdef40dd6ec193d18d39638454a3c972f1fec. There was an issue found in post-commit by: https://lab.llvm.org/buildbot/#/builders/91/builds/15272 --- clang/docs/ReleaseNotes.rst | 8 -------- clang/include/clang-c/Index.h | 9 ++------- clang/test/Index/record-completion-invocation.c | 2 +- clang/test/Index/record-parsing-invocation.c | 4 ++-- clang/tools/libclang/CIndex.cpp | 11 +---------- 5 files changed, 6 insertions(+), 28 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 94e0f10a31743..005bf99a62457 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -368,14 +368,6 @@ libclang has an evaluable bit width. Fixes undefined behavior when called on a bit-field whose width depends on a template paramter. -- ``clang_parseTranslationUnit`` and ``clang_parseTranslationUnit2`` have been - changed to automatically locate the Clang installation directory relative to - the location of the libclang binary and use it for system headers installed - alongside the Clang installation. It is no longer necessary to manually - locate such system headers or use the ``clang_parseTranslationUnit2FullArgv`` - function for this purpose if libclang has been installed in the default - location. 
- Static Analyzer --------------- - Fix incorrect alignment attribute on the this parameter of certain diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h index 8275f2941a41c..c7d32e6a152ae 100644 --- a/clang/include/clang-c/Index.h +++ b/clang/include/clang-c/Index.h @@ -899,13 +899,8 @@ CINDEX_LINKAGE enum CXErrorCode clang_parseTranslationUnit2( /** * Same as clang_parseTranslationUnit2 but requires a full command line - * for \c command_line_args including argv[0]. - * - * This is useful if the driver uses paths relative to the binary and either - * you are targeting libclang versions older than Clang 17, or libclang is - * installed to a non-standard location. Clang 17 and newer will automatically - * use the correct argv[0] if libclang is installed in the lib directory - * parallel to the bin directory where the clang binary is installed. + * for \c command_line_args including argv[0]. This is useful if the standard + * library paths are relative to the binary. */ CINDEX_LINKAGE enum CXErrorCode clang_parseTranslationUnit2FullArgv( CXIndex CIdx, const char *source_filename, diff --git a/clang/test/Index/record-completion-invocation.c b/clang/test/Index/record-completion-invocation.c index 75eb9083908ae..4b667134fa2d4 100644 --- a/clang/test/Index/record-completion-invocation.c +++ b/clang/test/Index/record-completion-invocation.c @@ -9,4 +9,4 @@ // RUN: env LIBCLANG_DISABLE_CRASH_RECOVERY=1 CINDEXTEST_INVOCATION_EMISSION_PATH=%t not --crash c-index-test -code-completion-at=%s:10:1 "-remap-file=%s,%S/Inputs/record-parsing-invocation-remap.c" %s // RUN: cat %t/libclang-* | FileCheck %s -// CHECK: {"toolchain":"{{.*}}","libclang.operation":"complete","libclang.opts":1,"args":["{{.*}}bin{{.*}}clang","-fno-spell-checking","{{.*}}record-completion-invocation.c","-Xclang","-detailed-preprocessing-record","-fallow-editor-placeholders"],"invocation-args":["-code-completion-at={{.*}}record-completion-invocation.c:10:1"],"unsaved_file_hashes":[{"name":"{{.*}}record-completion-invocation.c","md5":"aee23773de90e665992b48209351d70e"}]} +// CHECK: {"toolchain":"{{.*}}","libclang.operation":"complete","libclang.opts":1,"args":["clang","-fno-spell-checking","{{.*}}record-completion-invocation.c","-Xclang","-detailed-preprocessing-record","-fallow-editor-placeholders"],"invocation-args":["-code-completion-at={{.*}}record-completion-invocation.c:10:1"],"unsaved_file_hashes":[{"name":"{{.*}}record-completion-invocation.c","md5":"aee23773de90e665992b48209351d70e"}]} diff --git a/clang/test/Index/record-parsing-invocation.c b/clang/test/Index/record-parsing-invocation.c index f370f014fb1cc..e0c4cdb05fb00 100644 --- a/clang/test/Index/record-parsing-invocation.c +++ b/clang/test/Index/record-parsing-invocation.c @@ -25,5 +25,5 @@ # pragma clang __debug parser_crash #endif -// CHECK: {"toolchain":"{{.*}}","libclang.operation":"parse","libclang.opts":1,"args":["{{.*}}bin{{.*}}clang","-fno-spell-checking","{{.*}}record-parsing-invocation.c","-Xclang","-detailed-preprocessing-record","-fallow-editor-placeholders"]} -// CHECK-UNSAVED: {"toolchain":"{{.*}}","libclang.operation":"parse","libclang.opts":1,"args":["{{.*}}bin{{.*}}clang","-fno-spell-checking","{{.*}}record-parsing-invocation.c","-Xclang","-detailed-preprocessing-record","-fallow-editor-placeholders"],"unsaved_file_hashes":[{"name":"{{.*}}record-parsing-invocation.c","md5":"aee23773de90e665992b48209351d70e"}]} +// CHECK: 
{"toolchain":"{{.*}}","libclang.operation":"parse","libclang.opts":1,"args":["clang","-fno-spell-checking","{{.*}}record-parsing-invocation.c","-Xclang","-detailed-preprocessing-record","-fallow-editor-placeholders"]} +// CHECK-UNSAVED: {"toolchain":"{{.*}}","libclang.operation":"parse","libclang.opts":1,"args":["clang","-fno-spell-checking","{{.*}}record-parsing-invocation.c","-Xclang","-detailed-preprocessing-record","-fallow-editor-placeholders"],"unsaved_file_hashes":[{"name":"{{.*}}record-parsing-invocation.c","md5":"aee23773de90e665992b48209351d70e"}]} diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp index 2aa12667d37e9..30416e46ce173 100644 --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -4013,17 +4013,8 @@ enum CXErrorCode clang_parseTranslationUnit2( struct CXUnsavedFile *unsaved_files, unsigned num_unsaved_files, unsigned options, CXTranslationUnit *out_TU) { noteBottomOfStack(); - - if (!CIdx) - return CXError_InvalidArguments; - - SmallString<64> ClangPath( - static_cast(CIdx)->getClangToolchainPath()); - llvm::sys::path::append(ClangPath, "bin"); - llvm::sys::path::append(ClangPath, "clang"); - SmallVector Args; - Args.push_back(ClangPath.c_str()); + Args.push_back("clang"); Args.append(command_line_args, command_line_args + num_command_line_args); return clang_parseTranslationUnit2FullArgv( CIdx, source_filename, Args.data(), Args.size(), unsaved_files, From 43fcfdb1d6a63129ffbb7d77174ccb56863d0b30 Mon Sep 17 00:00:00 2001 From: Kadir Cetinkaya Date: Mon, 20 Mar 2023 09:07:18 +0100 Subject: [PATCH 110/208] [IncludeCleaner][clangd] Mark umbrella headers as users of private Private headers inside umbrella files shouldn't be marked as unused. Differential Revision: https://reviews.llvm.org/D146406 --- clang-tools-extra/clangd/IncludeCleaner.cpp | 16 +++++++- .../clangd/unittests/IncludeCleanerTests.cpp | 21 ++++++++++ .../include-cleaner/lib/Analysis.cpp | 22 +++++++++-- .../unittests/AnalysisTest.cpp | 38 ++++++++++++++----- clang/include/clang/Testing/TestAST.h | 3 ++ clang/lib/Testing/TestAST.cpp | 5 ++- 6 files changed, 89 insertions(+), 16 deletions(-) diff --git a/clang-tools-extra/clangd/IncludeCleaner.cpp b/clang-tools-extra/clangd/IncludeCleaner.cpp index 98135529f259b..ee470bd8b963f 100644 --- a/clang-tools-extra/clangd/IncludeCleaner.cpp +++ b/clang-tools-extra/clangd/IncludeCleaner.cpp @@ -93,8 +93,6 @@ bool isFilteredByConfig(const Config &Cfg, llvm::StringRef HeaderPath) { static bool mayConsiderUnused(const Inclusion &Inc, ParsedAST &AST, const Config &Cfg, const include_cleaner::PragmaIncludes *PI) { - if (PI && PI->shouldKeep(Inc.HashLine + 1)) - return false; // FIXME(kirillbobyrev): We currently do not support the umbrella headers. // System headers are likely to be standard library headers. // Until we have good support for umbrella headers, don't warn about them. @@ -108,6 +106,20 @@ static bool mayConsiderUnused(const Inclusion &Inc, ParsedAST &AST, auto FE = AST.getSourceManager().getFileManager().getFileRef( AST.getIncludeStructure().getRealPath(HID)); assert(FE); + if (PI) { + if (PI->shouldKeep(Inc.HashLine + 1)) + return false; + // Check if main file is the public interface for a private header. If so we + // shouldn't diagnose it as unused. + if(auto PHeader = PI->getPublic(*FE); !PHeader.empty()) { + PHeader = PHeader.trim("<>\""); + // Since most private -> public mappings happen in a verbatim way, we + // check textually here. 
This might go wrong in presence of symlinks or + // header mappings. But that's not different than rest of the places. + if(AST.tuPath().endswith(PHeader)) + return false; + } + } // Headers without include guards have side effects and are not // self-contained, skip them. if (!AST.getPreprocessor().getHeaderSearchInfo().isFileMultipleIncludeGuarded( diff --git a/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp b/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp index 409e3cee791c3..69b4e07439c38 100644 --- a/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp +++ b/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp @@ -30,6 +30,7 @@ #include "gtest/gtest.h" #include #include +#include #include namespace clang { @@ -328,6 +329,26 @@ TEST(IncludeCleaner, NoDiagsForObjC) { ParsedAST AST = TU.build(); EXPECT_THAT(AST.getDiagnostics(), llvm::ValueIs(IsEmpty())); } + +TEST(IncludeCleaner, UmbrellaUsesPrivate) { + TestTU TU; + TU.Code = R"cpp( + #include "private.h" + )cpp"; + TU.AdditionalFiles["private.h"] = guard(R"cpp( + // IWYU pragma: private, include "public.h" + void foo() {} + )cpp"); + TU.Filename = "public.h"; + Config Cfg; + Cfg.Diagnostics.UnusedIncludes = Config::IncludesPolicy::Strict; + WithContextValue Ctx(Config::Key, std::move(Cfg)); + ParsedAST AST = TU.build(); + EXPECT_THAT(AST.getDiagnostics(), llvm::ValueIs(IsEmpty())); + IncludeCleanerFindings Findings = computeIncludeCleanerFindings(AST); + EXPECT_THAT(Findings.UnusedIncludes, IsEmpty()); +} + } // namespace } // namespace clangd } // namespace clang diff --git a/clang-tools-extra/include-cleaner/lib/Analysis.cpp b/clang-tools-extra/include-cleaner/lib/Analysis.cpp index 6237bdb46babf..fb0879b7aab63 100644 --- a/clang-tools-extra/include-cleaner/lib/Analysis.cpp +++ b/clang-tools-extra/include-cleaner/lib/Analysis.cpp @@ -90,9 +90,25 @@ AnalysisResults analyze(llvm::ArrayRef ASTRoots, }); AnalysisResults Results; - for (const Include &I : Inc.all()) - if (!Used.contains(&I) && PI && !PI->shouldKeep(I.Line)) - Results.Unused.push_back(&I); + for (const Include &I : Inc.all()) { + if (Used.contains(&I)) + continue; + if (PI) { + if (PI->shouldKeep(I.Line)) + continue; + // Check if main file is the public interface for a private header. If so + // we shouldn't diagnose it as unused. + if (auto PHeader = PI->getPublic(I.Resolved); !PHeader.empty()) { + PHeader = PHeader.trim("<>\""); + // Since most private -> public mappings happen in a verbatim way, we + // check textually here. This might go wrong in presence of symlinks or + // header mappings. But that's not different than rest of the places. 
+ if (MainFile->tryGetRealPathName().endswith(PHeader)) + continue; + } + } + Results.Unused.push_back(&I); + } for (llvm::StringRef S : Missing.keys()) Results.Missing.push_back(S.str()); llvm::sort(Results.Missing); diff --git a/clang-tools-extra/include-cleaner/unittests/AnalysisTest.cpp b/clang-tools-extra/include-cleaner/unittests/AnalysisTest.cpp index c34c6c0a29a81..a2084d4f37903 100644 --- a/clang-tools-extra/include-cleaner/unittests/AnalysisTest.cpp +++ b/clang-tools-extra/include-cleaner/unittests/AnalysisTest.cpp @@ -24,6 +24,7 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include +#include namespace clang::include_cleaner { namespace { @@ -212,17 +213,34 @@ int x = a + c; return std::make_unique(PP, PI); }; - TestAST AST(Inputs); - auto Decls = AST.context().getTranslationUnitDecl()->decls(); - auto Results = - analyze(std::vector{Decls.begin(), Decls.end()}, - PP.MacroReferences, PP.Includes, &PI, AST.sourceManager(), - AST.preprocessor().getHeaderSearchInfo()); + { + TestAST AST(Inputs); + auto Decls = AST.context().getTranslationUnitDecl()->decls(); + auto Results = + analyze(std::vector{Decls.begin(), Decls.end()}, + PP.MacroReferences, PP.Includes, &PI, AST.sourceManager(), + AST.preprocessor().getHeaderSearchInfo()); + + const Include *B = PP.Includes.atLine(3); + ASSERT_EQ(B->Spelled, "b.h"); + EXPECT_THAT(Results.Missing, ElementsAre("\"c.h\"")); + EXPECT_THAT(Results.Unused, ElementsAre(B)); + } - const Include *B = PP.Includes.atLine(3); - ASSERT_EQ(B->Spelled, "b.h"); - EXPECT_THAT(Results.Missing, ElementsAre("\"c.h\"")); - EXPECT_THAT(Results.Unused, ElementsAre(B)); + // Check that umbrella header uses private include. + { + Inputs.Code = R"cpp(#include "private.h")cpp"; + Inputs.ExtraFiles["private.h"] = + guard("// IWYU pragma: private, include \"public.h\""); + Inputs.FileName = "public.h"; + PP.Includes = {}; + PI = {}; + TestAST AST(Inputs); + EXPECT_FALSE(PP.Includes.all().empty()); + auto Results = analyze({}, {}, PP.Includes, &PI, AST.sourceManager(), + AST.preprocessor().getHeaderSearchInfo()); + EXPECT_THAT(Results.Unused, testing::IsEmpty()); + } } TEST(FixIncludes, Basic) { diff --git a/clang/include/clang/Testing/TestAST.h b/clang/include/clang/Testing/TestAST.h index 7ba2ca882b91c..845e31f65438b 100644 --- a/clang/include/clang/Testing/TestAST.h +++ b/clang/include/clang/Testing/TestAST.h @@ -49,6 +49,9 @@ struct TestInputs { /// Keys are plain filenames ("foo.h"), values are file content. llvm::StringMap ExtraFiles = {}; + /// Filename to use for translation unit. A default will be used when empty. + std::string FileName; + /// By default, error diagnostics during parsing are reported as gtest errors. /// To suppress this, set ErrorOK or include "error-ok" in a comment in Code. /// In either case, all diagnostics appear in TestAST::diagnostics(). 
diff --git a/clang/lib/Testing/TestAST.cpp b/clang/lib/Testing/TestAST.cpp index 8c79fcd7d6363..3a50c2d9b5d05 100644 --- a/clang/lib/Testing/TestAST.cpp +++ b/clang/lib/Testing/TestAST.cpp @@ -16,6 +16,7 @@ #include "llvm/Support/VirtualFileSystem.h" #include "gtest/gtest.h" +#include namespace clang { namespace { @@ -91,7 +92,9 @@ TestAST::TestAST(const TestInputs &In) { Argv.push_back(S.c_str()); for (const auto &S : In.ExtraArgs) Argv.push_back(S.c_str()); - std::string Filename = getFilenameForTesting(In.Language).str(); + std::string Filename = In.FileName; + if (Filename.empty()) + Filename = getFilenameForTesting(In.Language).str(); Argv.push_back(Filename.c_str()); Clang->setInvocation(std::make_unique()); if (!CompilerInvocation::CreateFromArgs(Clang->getInvocation(), Argv, From 6aa7cc037f2f95c237c1d82c523f8857fa3a10c3 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 23 Mar 2023 12:18:45 +0000 Subject: [PATCH 111/208] [X86] LowerVectorAllZero - add 512-bit support with AVX512 vptestnmd+kortestw patterns Another step toward #53419 - this is also another step towards expanding MatchVectorAllZeroTest to match any pair of vectors and merge EmitAVX512Test into it. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 13 +++++- llvm/test/CodeGen/X86/ptest.ll | 15 +++--- llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll | 46 ++++++++----------- 3 files changed, 35 insertions(+), 39 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index e828fe4b9dd15..e006388b6e928 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -24192,14 +24192,23 @@ static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC, DAG.getConstant(0, DL, IntVT)); } - // Split down to 128/256-bit vector. - unsigned TestSize = Subtarget.hasAVX() ? 256 : 128; + // Split down to 128/256/512-bit vector. + unsigned TestSize = + Subtarget.useAVX512Regs() ? 512 : (Subtarget.hasAVX() ? 256 : 128); while (VT.getSizeInBits() > TestSize) { auto Split = DAG.SplitVector(V, DL); VT = Split.first.getValueType(); V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second); } + bool UseKORTEST = Subtarget.useAVX512Regs(); + if (UseKORTEST && VT.is512BitVector()) { + V = DAG.getBitcast(MVT::v16i32, MaskBits(V)); + V = DAG.getSetCC(DL, MVT::v16i1, V, + getZeroVector(MVT::v16i32, Subtarget, DAG, DL), ISD::SETEQ); + return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V); + } + bool UsePTEST = Subtarget.hasSSE41(); if (UsePTEST) { MVT TestVT = VT.is128BitVector() ? 
MVT::v2i64 : MVT::v4i64; diff --git a/llvm/test/CodeGen/X86/ptest.ll b/llvm/test/CodeGen/X86/ptest.ll index 066cbb6193317..c417c5d15b874 100644 --- a/llvm/test/CodeGen/X86/ptest.ll +++ b/llvm/test/CodeGen/X86/ptest.ll @@ -148,9 +148,8 @@ define i32 @veccond512(<16 x i32> %input) { ; ; AVX512-LABEL: veccond512: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vptest %ymm0, %ymm0 +; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: je .LBB2_2 ; AVX512-NEXT: # %bb.1: # %if-true-block ; AVX512-NEXT: xorl %eax, %eax @@ -268,10 +267,9 @@ define i32 @vectest512(<16 x i32> %input) { ; ; AVX512-LABEL: vectest512: ; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: vptest %ymm0, %ymm0 +; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -380,9 +378,8 @@ define i32 @vecsel512(<16 x i32> %input, i32 %a, i32 %b) { ; AVX512-LABEL: vecsel512: ; AVX512: # %bb.0: ; AVX512-NEXT: movl %edi, %eax -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vptest %ymm0, %ymm0 +; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: cmovel %esi, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll index fcb0ab6090398..5d921c0aa2c62 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll @@ -105,9 +105,8 @@ define i1 @test_v8i64(<8 x i64> %a0) { ; ; AVX512-LABEL: test_v8i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vptest %ymm0, %ymm0 +; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -169,9 +168,8 @@ define i1 @test_v16i64(<16 x i64> %a0) { ; AVX512-LABEL: test_v16i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vptest %ymm0, %ymm0 +; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -298,9 +296,8 @@ define i1 @test_v16i32(<16 x i32> %a0) { ; ; AVX512-LABEL: test_v16i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vptest %ymm0, %ymm0 +; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -362,9 +359,8 @@ define i1 @test_v32i32(<32 x i32> %a0) { ; AVX512-LABEL: test_v32i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vptest %ymm0, %ymm0 +; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -510,9 +506,8 @@ define i1 @test_v32i16(<32 x i16> %a0) { ; ; AVX512-LABEL: test_v32i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpor %ymm1, %ymm0, 
%ymm0 -; AVX512-NEXT: vptest %ymm0, %ymm0 +; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -574,9 +569,8 @@ define i1 @test_v64i16(<64 x i16> %a0) { ; AVX512-LABEL: test_v64i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vptest %ymm0, %ymm0 +; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -741,9 +735,8 @@ define i1 @test_v64i8(<64 x i8> %a0) { ; ; AVX512-LABEL: test_v64i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vptest %ymm0, %ymm0 +; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -805,9 +798,8 @@ define i1 @test_v128i8(<128 x i8> %a0) { ; AVX512-LABEL: test_v128i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vptest %ymm0, %ymm0 +; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1014,10 +1006,8 @@ define i1 @mask_v128i8(<128 x i8> %a0) { ; AVX512-LABEL: mask_v128i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] -; AVX512-NEXT: vptest %ymm1, %ymm0 +; AVX512-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq From c39dd7c1db97fa367cb6282067b74cd8e55ef09a Mon Sep 17 00:00:00 2001 From: Job Noorman Date: Thu, 23 Mar 2023 12:17:57 +0000 Subject: [PATCH 112/208] [RISCV][MC] Add support for RV64E Implement MC support for the recently ratified RV64E base instruction set. 
Differential Revision: https://reviews.llvm.org/D143570 --- clang/test/Driver/riscv-arch.c | 10 ------ clang/test/Driver/riscv-features.c | 4 --- llvm/docs/RISCVUsage.rst | 8 ++--- llvm/docs/ReleaseNotes.rst | 1 + llvm/lib/Support/RISCVISAInfo.cpp | 23 ++++-------- .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 16 ++++----- .../RISCV/Disassembler/RISCVDisassembler.cpp | 4 +-- .../RISCV/MCTargetDesc/RISCVBaseInfo.cpp | 14 +++++--- .../Target/RISCV/MCTargetDesc/RISCVBaseInfo.h | 1 + .../RISCV/MCTargetDesc/RISCVELFStreamer.cpp | 1 + .../MCTargetDesc/RISCVTargetStreamer.cpp | 8 ++--- llvm/lib/Target/RISCV/RISCVFeatures.td | 10 +++--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 4 +-- .../RISCV/mattr-invalid-combination.ll | 5 --- llvm/test/CodeGen/RISCV/rv32e.ll | 7 ---- llvm/test/CodeGen/RISCV/rve.ll | 8 +++++ llvm/test/MC/RISCV/attribute-arch.s | 6 +++- llvm/test/MC/RISCV/elf-flags.s | 3 ++ llvm/test/MC/RISCV/invalid-attribute.s | 2 +- .../test/MC/RISCV/mattr-invalid-combination.s | 4 --- llvm/test/MC/RISCV/rv32e-invalid.s | 6 +++- llvm/test/MC/RISCV/rv32e-valid.s | 5 +++ llvm/test/MC/RISCV/rv64e-valid.s | 36 +++++++++++++++++++ llvm/test/MC/RISCV/target-abi-invalid.s | 17 +++++++++ llvm/test/MC/RISCV/target-abi-valid.s | 4 +++ llvm/unittests/Support/RISCVISAInfoTest.cpp | 15 +++++--- 26 files changed, 139 insertions(+), 83 deletions(-) delete mode 100644 llvm/test/CodeGen/RISCV/mattr-invalid-combination.ll delete mode 100644 llvm/test/CodeGen/RISCV/rv32e.ll create mode 100644 llvm/test/CodeGen/RISCV/rve.ll delete mode 100644 llvm/test/MC/RISCV/mattr-invalid-combination.s create mode 100644 llvm/test/MC/RISCV/rv64e-valid.s diff --git a/clang/test/Driver/riscv-arch.c b/clang/test/Driver/riscv-arch.c index 610f79d64ada2..cbc1464cbcd6f 100644 --- a/clang/test/Driver/riscv-arch.c +++ b/clang/test/Driver/riscv-arch.c @@ -198,11 +198,6 @@ // Testing specific messages and unsupported extensions. 
-// RUN: %clang --target=riscv64-unknown-elf -march=rv64e -### %s \ -// RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV64E %s -// RV64E: error: invalid arch name 'rv64e', -// RV64E: standard user-level extension 'e' requires 'rv32' - // RUN: %clang --target=riscv32-unknown-elf -march=rv32imC -### %s \ // RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32-LOWER %s // RV32-LOWER: error: invalid arch name 'rv32imC', @@ -223,11 +218,6 @@ // RV32-ORDER: error: invalid arch name 'rv32imcq', // RV32-ORDER: standard user-level extension not given in canonical order 'q' -// RUN: %clang --target=riscv32-unknown-elf -march=rv64e -### %s \ -// RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV64-EER %s -// RV64-EER: error: invalid arch name 'rv64e', -// RV64-EER: standard user-level extension 'e' requires 'rv32' - // RUN: %clang --target=riscv32-unknown-elf -march=rv32izve32f -### %s \ // RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32-ZVE32F-ER %s // RV32-ZVE32F-ER: error: invalid arch name 'rv32izve32f', diff --git a/clang/test/Driver/riscv-features.c b/clang/test/Driver/riscv-features.c index 98445b1920301..b189fdeacec8c 100644 --- a/clang/test/Driver/riscv-features.c +++ b/clang/test/Driver/riscv-features.c @@ -33,10 +33,6 @@ // DEFAULT-LINUX-SAME: "-target-feature" "+d" // DEFAULT-LINUX-SAME: "-target-feature" "+c" -// RUN: not %clang -cc1 -triple riscv64-unknown-elf -target-feature +e 2>&1 | FileCheck %s -check-prefix=RV64-WITH-E - -// RV64-WITH-E: error: invalid feature combination: standard user-level extension 'e' requires 'rv32' - // RUN: not %clang -c --target=riscv64-linux-gnu -gsplit-dwarf %s 2>&1 | FileCheck %s --check-prefix=ERR-SPLIT-DWARF // RUN: not %clang -c --target=riscv64 -gsplit-dwarf=single %s 2>&1 | FileCheck %s --check-prefix=ERR-SPLIT-DWARF // RUN: %clang -### -c --target=riscv64 -mno-relax -g -gsplit-dwarf %s 2>&1 | FileCheck %s --check-prefix=SPLIT-DWARF diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 429b59a8d9404..ffd1028cedd80 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -15,9 +15,9 @@ supported variations of the RISC-V specification. It lives in the Base ISAs ========= -The specification defines four base instruction sets: RV32I, RV32E, RV64I, -and RV128I. Currently, LLVM fully supports RV32I, and RV64I. RV32E is -supported by the assembly-based tools only. RV128I is not supported. +The specification defines five base instruction sets: RV32I, RV32E, RV64I, +RV64E, and RV128I. Currently, LLVM fully supports RV32I, and RV64I. RV32E and +RV64E are supported by the assembly-based tools only. RV128I is not supported. To specify the target triple: @@ -27,7 +27,7 @@ To specify the target triple: Architecture Description ============ ============================================================== ``riscv32`` RISC-V with XLEN=32 (i.e. RV32I or RV32E) - ``riscv64`` RISC-V with XLEN=64 (i.e. RV64I) + ``riscv64`` RISC-V with XLEN=64 (i.e. RV64I or RV64E) ============ ============================================================== To select an E variant ISA (e.g. RV32E instead of RV32I), use the base diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index d87d20704f166..525f57a90dfb0 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -144,6 +144,7 @@ Changes to the RISC-V Backend * Adds support for the vendor-defined XTHeadCmo (cache management operations) extension. * Adds support for the vendor-defined XTHeadSync (multi-core synchronization instructions) extension. 
* Added support for the vendor-defined XTHeadFMemIdx (indexed memory operations for floating point) extension. +* Assembler support for RV64E was added. Changes to the WebAssembly Backend ---------------------------------- diff --git a/llvm/lib/Support/RISCVISAInfo.cpp b/llvm/lib/Support/RISCVISAInfo.cpp index 93cf66ff1f739..35c249a7b3703 100644 --- a/llvm/lib/Support/RISCVISAInfo.cpp +++ b/llvm/lib/Support/RISCVISAInfo.cpp @@ -584,8 +584,9 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension, bool HasRV64 = Arch.startswith("rv64"); // ISA string must begin with rv32 or rv64. if (!(Arch.startswith("rv32") || HasRV64) || (Arch.size() < 5)) { - return createStringError(errc::invalid_argument, - "string must begin with rv32{i,e,g} or rv64{i,g}"); + return createStringError( + errc::invalid_argument, + "string must begin with rv32{i,e,g} or rv64{i,e,g}"); } unsigned XLen = HasRV64 ? 64 : 32; @@ -601,14 +602,7 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension, default: return createStringError(errc::invalid_argument, "first letter should be 'e', 'i' or 'g'"); - case 'e': { - // Extension 'e' is not allowed in rv64. - if (HasRV64) - return createStringError( - errc::invalid_argument, - "standard user-level extension 'e' requires 'rv32'"); - break; - } + case 'e': case 'i': break; case 'g': @@ -828,8 +822,6 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension, } Error RISCVISAInfo::checkDependency() { - bool IsRv32 = XLen == 32; - bool HasE = Exts.count("e") != 0; bool HasD = Exts.count("d") != 0; bool HasF = Exts.count("f") != 0; bool HasZfinx = Exts.count("zfinx") != 0; @@ -839,11 +831,6 @@ Error RISCVISAInfo::checkDependency() { bool HasZve64d = Exts.count("zve64d") != 0; bool HasZvl = MinVLen != 0; - if (HasE && !IsRv32) - return createStringError( - errc::invalid_argument, - "standard user-level extension 'e' requires 'rv32'"); - if (HasF && HasZfinx) return createStringError(errc::invalid_argument, "'f' and 'zfinx' extensions are incompatible"); @@ -1115,6 +1102,8 @@ StringRef RISCVISAInfo::computeDefaultABI() const { } else if (XLen == 64) { if (hasExtension("d")) return "lp64d"; + if (hasExtension("e")) + return "lp64e"; return "lp64"; } llvm_unreachable("Invalid XLEN"); diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index d984f39321a6e..1627761052284 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -67,7 +67,7 @@ class RISCVAsmParser : public MCTargetAsmParser { SMLoc getLoc() const { return getParser().getTok().getLoc(); } bool isRV64() const { return getSTI().hasFeature(RISCV::Feature64Bit); } - bool isRV32E() const { return getSTI().hasFeature(RISCV::FeatureRV32E); } + bool isRVE() const { return getSTI().hasFeature(RISCV::FeatureRVE); } RISCVTargetStreamer &getTargetStreamer() { assert(getParser().getStreamer().getTargetStreamer() && @@ -1352,9 +1352,9 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // Attempts to match Name as a register (either using the default name or // alternative ABI names), setting RegNo to the matching register. Upon -// failure, returns true and sets RegNo to 0. If IsRV32E then registers +// failure, returns true and sets RegNo to 0. If IsRVE then registers // x16-x31 will be rejected. 
-static bool matchRegisterNameHelper(bool IsRV32E, MCRegister &RegNo, +static bool matchRegisterNameHelper(bool IsRVE, MCRegister &RegNo, StringRef Name) { RegNo = MatchRegisterName(Name); // The 16-/32- and 64-bit FPRs have the same asm name. Check that the initial @@ -1366,7 +1366,7 @@ static bool matchRegisterNameHelper(bool IsRV32E, MCRegister &RegNo, static_assert(RISCV::F0_D < RISCV::F0_F, "FPR matching must be updated"); if (RegNo == RISCV::NoRegister) RegNo = MatchRegisterAltName(Name); - if (IsRV32E && RegNo >= RISCV::X16 && RegNo <= RISCV::X31) + if (IsRVE && RegNo >= RISCV::X16 && RegNo <= RISCV::X31) RegNo = RISCV::NoRegister; return RegNo == RISCV::NoRegister; } @@ -1387,7 +1387,7 @@ OperandMatchResultTy RISCVAsmParser::tryParseRegister(MCRegister &RegNo, RegNo = 0; StringRef Name = getLexer().getTok().getIdentifier(); - if (matchRegisterNameHelper(isRV32E(), (MCRegister &)RegNo, Name)) + if (matchRegisterNameHelper(isRVE(), (MCRegister &)RegNo, Name)) return MatchOperand_NoMatch; getParser().Lex(); // Eat identifier token. @@ -1420,7 +1420,7 @@ OperandMatchResultTy RISCVAsmParser::parseRegister(OperandVector &Operands, case AsmToken::Identifier: StringRef Name = getLexer().getTok().getIdentifier(); MCRegister RegNo; - matchRegisterNameHelper(isRV32E(), RegNo, Name); + matchRegisterNameHelper(isRVE(), RegNo, Name); if (RegNo == RISCV::NoRegister) { if (HadParens) @@ -1908,7 +1908,7 @@ OperandMatchResultTy RISCVAsmParser::parseMaskReg(OperandVector &Operands) { return MatchOperand_ParseFail; } MCRegister RegNo; - matchRegisterNameHelper(isRV32E(), RegNo, Name); + matchRegisterNameHelper(isRVE(), RegNo, Name); if (RegNo == RISCV::NoRegister) return MatchOperand_NoMatch; @@ -1927,7 +1927,7 @@ OperandMatchResultTy RISCVAsmParser::parseGPRAsFPR(OperandVector &Operands) { StringRef Name = getLexer().getTok().getIdentifier(); MCRegister RegNo; - matchRegisterNameHelper(isRV32E(), RegNo, Name); + matchRegisterNameHelper(isRVE(), RegNo, Name); if (RegNo == RISCV::NoRegister) return MatchOperand_NoMatch; diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index 15352c1c0885d..2d01d6df3a198 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -61,9 +61,9 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVDisassembler() { static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, uint32_t RegNo, uint64_t Address, const MCDisassembler *Decoder) { - bool IsRV32E = Decoder->getSubtargetInfo().hasFeature(RISCV::FeatureRV32E); + bool IsRVE = Decoder->getSubtargetInfo().hasFeature(RISCV::FeatureRVE); - if (RegNo >= 32 || (IsRV32E && RegNo >= 16)) + if (RegNo >= 32 || (IsRVE && RegNo >= 16)) return MCDisassembler::Fail; MCRegister Reg = RISCV::X0 + RegNo; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp index 98c8e883e5960..8f891a04def53 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp @@ -40,7 +40,7 @@ ABI computeTargetABI(const Triple &TT, FeatureBitset FeatureBits, StringRef ABIName) { auto TargetABI = getTargetABI(ABIName); bool IsRV64 = TT.isArch64Bit(); - bool IsRV32E = FeatureBits[RISCV::FeatureRV32E]; + bool IsRVE = FeatureBits[RISCV::FeatureRVE]; if (!ABIName.empty() && TargetABI == ABI_Unknown) { errs() @@ -54,11 +54,18 @@ ABI computeTargetABI(const Triple &TT, 
FeatureBitset FeatureBits, errs() << "64-bit ABIs are not supported for 32-bit targets (ignoring " "target-abi)\n"; TargetABI = ABI_Unknown; - } else if (IsRV32E && TargetABI != ABI_ILP32E && TargetABI != ABI_Unknown) { + } else if (!IsRV64 && IsRVE && TargetABI != ABI_ILP32E && + TargetABI != ABI_Unknown) { // TODO: move this checking to RISCVTargetLowering and RISCVAsmParser errs() << "Only the ilp32e ABI is supported for RV32E (ignoring target-abi)\n"; TargetABI = ABI_Unknown; + } else if (IsRV64 && IsRVE && TargetABI != ABI_LP64E && + TargetABI != ABI_Unknown) { + // TODO: move this checking to RISCVTargetLowering and RISCVAsmParser + errs() + << "Only the lp64e ABI is supported for RV64E (ignoring target-abi)\n"; + TargetABI = ABI_Unknown; } if (TargetABI != ABI_Unknown) @@ -80,6 +87,7 @@ ABI getTargetABI(StringRef ABIName) { .Case("lp64", ABI_LP64) .Case("lp64f", ABI_LP64F) .Case("lp64d", ABI_LP64D) + .Case("lp64e", ABI_LP64E) .Default(ABI_Unknown); return TargetABI; } @@ -101,8 +109,6 @@ void validate(const Triple &TT, const FeatureBitset &FeatureBits) { report_fatal_error("RV64 target requires an RV64 CPU"); if (!TT.isArch64Bit() && !FeatureBits[RISCV::Feature32Bit]) report_fatal_error("RV32 target requires an RV32 CPU"); - if (TT.isArch64Bit() && FeatureBits[RISCV::FeatureRV32E]) - report_fatal_error("RV32E can't be enabled for an RV64 target"); if (FeatureBits[RISCV::Feature32Bit] && FeatureBits[RISCV::Feature64Bit]) report_fatal_error("RV32 and RV64 can't be combined"); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index 70fdc0e4ff120..175059fdf08e5 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -408,6 +408,7 @@ enum ABI { ABI_LP64, ABI_LP64F, ABI_LP64D, + ABI_LP64E, ABI_Unknown }; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp index a05254b0ae579..356bb895c6ed4 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp @@ -103,6 +103,7 @@ void RISCVTargetELFStreamer::finish() { EFlags |= ELF::EF_RISCV_FLOAT_ABI_DOUBLE; break; case RISCVABI::ABI_ILP32E: + case RISCVABI::ABI_LP64E: EFlags |= ELF::EF_RISCV_RVE; break; case RISCVABI::ABI_Unknown: diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp index 756cc14a87014..f7bcc197b1872 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp @@ -47,10 +47,10 @@ void RISCVTargetStreamer::setTargetABI(RISCVABI::ABI ABI) { } void RISCVTargetStreamer::emitTargetAttributes(const MCSubtargetInfo &STI) { - if (STI.hasFeature(RISCV::FeatureRV32E)) - emitAttribute(RISCVAttrs::STACK_ALIGN, RISCVAttrs::ALIGN_4); - else - emitAttribute(RISCVAttrs::STACK_ALIGN, RISCVAttrs::ALIGN_16); + if (STI.hasFeature(RISCV::FeatureRVE)) + report_fatal_error("Codegen not yet implemented for RVE"); + + emitAttribute(RISCVAttrs::STACK_ALIGN, RISCVAttrs::ALIGN_16); auto ParseResult = RISCVFeatures::parseFeatureBits( STI.hasFeature(RISCV::Feature64Bit), STI.getFeatureBits()); diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index b0bb2992f6b42..0cf73bc37be84 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -589,11 
+589,11 @@ def IsRV32 : Predicate<"!Subtarget->is64Bit()">, defvar RV32 = DefaultMode; def RV64 : HwMode<"+64bit", [IsRV64]>; -def FeatureRV32E - : SubtargetFeature<"e", "IsRV32E", "true", - "Implements RV32E (provides 16 rather than 32 GPRs)">; -def IsRV32E : Predicate<"Subtarget->isRV32E()">, - AssemblerPredicate<(all_of FeatureRV32E)>; +def FeatureRVE + : SubtargetFeature<"e", "IsRVE", "true", + "Implements RV{32,64}E (provides 16 rather than 32 GPRs)">; +def IsRVE : Predicate<"Subtarget->isRVE()">, + AssemblerPredicate<(all_of FeatureRVE)>; def FeatureRelax : SubtargetFeature<"relax", "EnableLinkerRelax", "true", diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 595e094662f9a..9310c8161cd46 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -76,8 +76,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, const RISCVSubtarget &STI) : TargetLowering(TM), Subtarget(STI) { - if (Subtarget.isRV32E()) - report_fatal_error("Codegen not yet implemented for RV32E"); + if (Subtarget.isRVE()) + report_fatal_error("Codegen not yet implemented for RVE"); RISCVABI::ABI ABI = Subtarget.getTargetABI(); assert(ABI != RISCVABI::ABI_Unknown && "Improperly initialised target ABI"); diff --git a/llvm/test/CodeGen/RISCV/mattr-invalid-combination.ll b/llvm/test/CodeGen/RISCV/mattr-invalid-combination.ll deleted file mode 100644 index e5bdb96fd0741..0000000000000 --- a/llvm/test/CodeGen/RISCV/mattr-invalid-combination.ll +++ /dev/null @@ -1,5 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: not --crash llc -mtriple=riscv64 -mattr=+e < %s 2>&1 \ -; RUN: | FileCheck -check-prefix=RV64E %s - -; RV64E: LLVM ERROR: RV32E can't be enabled for an RV64 target diff --git a/llvm/test/CodeGen/RISCV/rv32e.ll b/llvm/test/CodeGen/RISCV/rv32e.ll deleted file mode 100644 index 88379ab438725..0000000000000 --- a/llvm/test/CodeGen/RISCV/rv32e.ll +++ /dev/null @@ -1,7 +0,0 @@ -; RUN: not --crash llc -mtriple=riscv32 -mattr=+e < %s 2>&1 | FileCheck %s - -; CHECK: LLVM ERROR: Codegen not yet implemented for RV32E - -define void @nothing() nounwind { - ret void -} diff --git a/llvm/test/CodeGen/RISCV/rve.ll b/llvm/test/CodeGen/RISCV/rve.ll new file mode 100644 index 0000000000000..29b9bab61f7ff --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rve.ll @@ -0,0 +1,8 @@ +; RUN: not --crash llc -mtriple=riscv32 -mattr=+e < %s 2>&1 | FileCheck %s +; RUN: not --crash llc -mtriple=riscv64 -mattr=+e < %s 2>&1 | FileCheck %s + +; CHECK: LLVM ERROR: Codegen not yet implemented for RVE + +define void @nothing() nounwind { + ret void +} diff --git a/llvm/test/MC/RISCV/attribute-arch.s b/llvm/test/MC/RISCV/attribute-arch.s index af0b3fe0cdc29..db1856e1c6677 100644 --- a/llvm/test/MC/RISCV/attribute-arch.s +++ b/llvm/test/MC/RISCV/attribute-arch.s @@ -1,7 +1,8 @@ ## Arch string without version. 
# RUN: llvm-mc %s -triple=riscv32 -filetype=asm | FileCheck %s -# RUN: llvm-mc %s -triple=riscv64 -filetype=asm | FileCheck %s +# RUN: llvm-mc %s -triple=riscv64 -filetype=asm \ +# RUN: | FileCheck --check-prefixes=CHECK,CHECK-RV64 %s .attribute arch, "rv32i" # CHECK: attribute 5, "rv32i2p0" @@ -15,6 +16,9 @@ .attribute arch, "rv32e" # CHECK: attribute 5, "rv32e2p0" +.attribute arch, "rv64e" +# CHECK-RV64: attribute 5, "rv64e2p0" + .attribute arch, "rv32i2_m2" # CHECK: attribute 5, "rv32i2p0_m2p0" diff --git a/llvm/test/MC/RISCV/elf-flags.s b/llvm/test/MC/RISCV/elf-flags.s index 543eadede1f70..546e129fb7194 100644 --- a/llvm/test/MC/RISCV/elf-flags.s +++ b/llvm/test/MC/RISCV/elf-flags.s @@ -5,6 +5,9 @@ # RUN: llvm-mc -triple=riscv32 -mattr=+e -filetype=obj < %s \ # RUN: | llvm-readobj --file-headers - \ # RUN: | FileCheck -check-prefix=CHECK-RVE %s +# RUN: llvm-mc -triple=riscv64 -mattr=+e -filetype=obj < %s \ +# RUN: | llvm-readobj --file-headers - \ +# RUN: | FileCheck -check-prefix=CHECK-RVE %s # RUN: llvm-mc -triple=riscv32 -mattr=+experimental-ztso -filetype=obj < %s | llvm-readobj --file-headers - | FileCheck -check-prefixes=CHECK-TSO %s # RUN: llvm-mc -triple=riscv64 -mattr=+experimental-ztso -filetype=obj < %s | llvm-readobj --file-headers - | FileCheck -check-prefixes=CHECK-TSO %s diff --git a/llvm/test/MC/RISCV/invalid-attribute.s b/llvm/test/MC/RISCV/invalid-attribute.s index 761a98902d5ef..3514452997266 100644 --- a/llvm/test/MC/RISCV/invalid-attribute.s +++ b/llvm/test/MC/RISCV/invalid-attribute.s @@ -7,7 +7,7 @@ # RUN: not llvm-mc %s -triple=riscv64 -filetype=asm 2>&1 | FileCheck %s .attribute arch, "foo" -# CHECK: [[@LINE-1]]:18: error: invalid arch name 'foo', string must begin with rv32{i,e,g} or rv64{i,g} +# CHECK: [[@LINE-1]]:18: error: invalid arch name 'foo', string must begin with rv32{i,e,g} or rv64{i,e,g} .attribute arch, "rv32i2p0_y2p0" # CHECK: [[@LINE-1]]:18: error: invalid arch name 'rv32i2p0_y2p0', invalid standard user-level extension 'y' diff --git a/llvm/test/MC/RISCV/mattr-invalid-combination.s b/llvm/test/MC/RISCV/mattr-invalid-combination.s deleted file mode 100644 index f75fd3723ed49..0000000000000 --- a/llvm/test/MC/RISCV/mattr-invalid-combination.s +++ /dev/null @@ -1,4 +0,0 @@ -# RUN: not --crash llvm-mc -triple riscv64 -mattr=+e < %s 2>&1 \ -# RUN: | FileCheck %s -check-prefix=RV64E - -# RV64E: LLVM ERROR: RV32E can't be enabled for an RV64 target diff --git a/llvm/test/MC/RISCV/rv32e-invalid.s b/llvm/test/MC/RISCV/rv32e-invalid.s index 760e7d49e5c46..9c19d3f40bcff 100644 --- a/llvm/test/MC/RISCV/rv32e-invalid.s +++ b/llvm/test/MC/RISCV/rv32e-invalid.s @@ -2,9 +2,13 @@ # RUN: llvm-mc -filetype=obj -triple=riscv32 < %s \ # RUN: | llvm-objdump --mattr=+e -M no-aliases -d -r - \ # RUN: | FileCheck -check-prefix=CHECK-DIS %s +# RUN: not llvm-mc -triple riscv64 -mattr=+e < %s 2>&1 | FileCheck %s +# RUN: llvm-mc -filetype=obj -triple=riscv64 < %s \ +# RUN: | llvm-objdump --mattr=+e -M no-aliases -d -r - \ +# RUN: | FileCheck -check-prefix=CHECK-DIS %s # Perform a simple check that registers x16-x31 (and the equivalent ABI names) -# are rejected for RV32E, when both assembling and disassembling. +# are rejected for RV32E/RV64E, when both assembling and disassembling. 
# CHECK-DIS: 37 18 00 00 diff --git a/llvm/test/MC/RISCV/rv32e-valid.s b/llvm/test/MC/RISCV/rv32e-valid.s index b0f435da50544..c2b77736d92b5 100644 --- a/llvm/test/MC/RISCV/rv32e-valid.s +++ b/llvm/test/MC/RISCV/rv32e-valid.s @@ -3,6 +3,11 @@ # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+e < %s \ # RUN: | llvm-objdump -M no-aliases -d -r - \ # RUN: | FileCheck -check-prefixes=CHECK-OBJ,CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -mattr=+e -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+e < %s \ +# RUN: | llvm-objdump -M no-aliases -d -r - \ +# RUN: | FileCheck -check-prefixes=CHECK-OBJ,CHECK-ASM-AND-OBJ %s # This file provides a basic test for RV32E, checking that the expected # set of registers and instructions are accepted. diff --git a/llvm/test/MC/RISCV/rv64e-valid.s b/llvm/test/MC/RISCV/rv64e-valid.s new file mode 100644 index 0000000000000..4780fd6ece4ab --- /dev/null +++ b/llvm/test/MC/RISCV/rv64e-valid.s @@ -0,0 +1,36 @@ +# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -mattr=+e -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+e < %s \ +# RUN: | llvm-objdump -M no-aliases -d -r - \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM-AND-OBJ %s + +# This file provides a basic test for RV64E, checking that the expected +# set of registers and instructions are accepted. It only tests instructions +# that are not valid in RV32E. + +# CHECK-ASM-AND-OBJ: ld a4, 25(a5) +ld x14, 25(x15) +# CHECK-ASM-AND-OBJ: sd a2, 36(a3) +sd a2, 36(a3) + +# CHECK-ASM-AND-OBJ: addiw a4, a5, 37 +addiw a4, a5, 37 +# CHECK-ASM-AND-OBJ: slliw t1, t1, 31 +slliw t1, t1, 31 +# CHECK-ASM-AND-OBJ: srliw a0, a4, 0 +srliw a0, a4, 0 +# CHECK-ASM-AND-OBJ: sraiw a1, sp, 15 +sraiw a1, sp, 15 +# CHECK-ASM-AND-OBJ: slliw t0, t1, 13 +slliw t0, t1, 13 + +# CHECK-ASM-AND-OBJ: addw ra, zero, zero +addw ra, zero, zero +# CHECK-ASM-AND-OBJ: subw t0, t2, t1 +subw t0, t2, t1 +# CHECK-ASM-AND-OBJ: sllw a5, a4, a3 +sllw a5, a4, a3 +# CHECK-ASM-AND-OBJ: srlw a0, s0, t0 +srlw a0, s0, t0 +# CHECK-ASM-AND-OBJ: sraw t0, a3, zero +sraw t0, a3, zero diff --git a/llvm/test/MC/RISCV/target-abi-invalid.s b/llvm/test/MC/RISCV/target-abi-invalid.s index 20e9f89153e05..d7dba182fd166 100644 --- a/llvm/test/MC/RISCV/target-abi-invalid.s +++ b/llvm/test/MC/RISCV/target-abi-invalid.s @@ -32,6 +32,8 @@ # RUN: | FileCheck -check-prefix=RV32EF-LP64F %s # RUN: llvm-mc -triple=riscv32 -mattr=+e,+d -target-abi lp64f < %s 2>&1 \ # RUN: | FileCheck -check-prefix=RV32EFD-LP64D %s +# RUN: llvm-mc -triple=riscv32 -mattr=+e -target-abi lp64e %s 2>&1 \ +# RUN: | FileCheck -check-prefix=RV32E-LP64E %s # RV32I-LP64: 64-bit ABIs are not supported for 32-bit targets (ignoring target-abi) # RV32IF-LP64F: 64-bit ABIs are not supported for 32-bit targets (ignoring target-abi) @@ -39,6 +41,7 @@ # RV32E-LP64: 64-bit ABIs are not supported for 32-bit targets (ignoring target-abi) # RV32EF-LP64F: 64-bit ABIs are not supported for 32-bit targets (ignoring target-abi) # RV32EFD-LP64D: 64-bit ABIs are not supported for 32-bit targets (ignoring target-abi) +# RV32E-LP64E: 64-bit ABIs are not supported for 32-bit targets (ignoring target-abi) # RUN: llvm-mc -triple=riscv32 -target-abi ilp32f < %s 2>&1 \ # RUN: | FileCheck -check-prefix=RV32I-ILP32F %s @@ -76,4 +79,18 @@ # RV32EFD-ILP32F: Only the ilp32e ABI is supported for RV32E (ignoring target-abi) # RV32EFD-ILP32D: Only the 
ilp32e ABI is supported for RV32E (ignoring target-abi) +# RUN: llvm-mc -triple=riscv64 -mattr=+e -target-abi lp64 < %s 2>&1 \ +# RUN: | FileCheck -check-prefix=RV64EF-LP64F %s +# RUN: llvm-mc -triple=riscv64 -mattr=+e,+f -target-abi lp64f < %s 2>&1 \ +# RUN: | FileCheck -check-prefix=RV64EF-LP64F %s +# RUN: llvm-mc -triple=riscv64 -mattr=+e,+d -target-abi lp64f < %s 2>&1 \ +# RUN: | FileCheck -check-prefix=RV64EFD-LP64F %s +# RUN: llvm-mc -triple=riscv64 -mattr=+e,+d -target-abi lp64d < %s 2>&1 \ +# RUN: | FileCheck -check-prefix=RV64EFD-LP64D %s + +# RV64E-LP64: Only the lp64e ABI is supported for RV64E (ignoring target-abi) +# RV64EF-LP64F: Only the lp64e ABI is supported for RV64E (ignoring target-abi) +# RV64EFD-LP64F: Only the lp64e ABI is supported for RV64E (ignoring target-abi) +# RV64EFD-LP64D: Only the lp64e ABI is supported for RV64E (ignoring target-abi) + nop diff --git a/llvm/test/MC/RISCV/target-abi-valid.s b/llvm/test/MC/RISCV/target-abi-valid.s index dab4420d0248c..63c0d4bf2e468 100644 --- a/llvm/test/MC/RISCV/target-abi-valid.s +++ b/llvm/test/MC/RISCV/target-abi-valid.s @@ -47,6 +47,10 @@ # RUN: | llvm-readobj --file-headers - \ # RUN: | FileCheck -check-prefix=CHECK-RVE %s +# RUN: llvm-mc -triple=riscv64 -target-abi lp64e -filetype=obj < %s \ +# RUN: | llvm-readobj --file-headers - \ +# RUN: | FileCheck -check-prefix=CHECK-RVE %s + # CHECK-NONE: Flags [ (0x0) # CHECK-NONE-NEXT: ] diff --git a/llvm/unittests/Support/RISCVISAInfoTest.cpp b/llvm/unittests/Support/RISCVISAInfoTest.cpp index 0b749eb0c6815..05997d2d2d2c4 100644 --- a/llvm/unittests/Support/RISCVISAInfoTest.cpp +++ b/llvm/unittests/Support/RISCVISAInfoTest.cpp @@ -109,7 +109,7 @@ TEST(ParseArchString, RejectsUpperCase) { TEST(ParseArchString, RejectsInvalidBaseISA) { for (StringRef Input : {"rv32", "rv64", "rv65i"}) { EXPECT_EQ(toString(RISCVISAInfo::parseArchString(Input, true).takeError()), - "string must begin with rv32{i,e,g} or rv64{i,g}"); + "string must begin with rv32{i,e,g} or rv64{i,e,g}"); } for (StringRef Input : {"rv32j", "rv64k", "rv32_i"}) { EXPECT_EQ(toString(RISCVISAInfo::parseArchString(Input, true).takeError()), @@ -118,11 +118,9 @@ TEST(ParseArchString, RejectsInvalidBaseISA) { } TEST(ParseArchString, RejectsUnsupportedBaseISA) { - EXPECT_EQ(toString(RISCVISAInfo::parseArchString("rv64e", true).takeError()), - "standard user-level extension 'e' requires 'rv32'"); for (StringRef Input : {"rv128i", "rv128g"}) { EXPECT_EQ(toString(RISCVISAInfo::parseArchString(Input, true).takeError()), - "string must begin with rv32{i,e,g} or rv64{i,g}"); + "string must begin with rv32{i,e,g} or rv64{i,e,g}"); } } @@ -167,6 +165,15 @@ TEST(ParseArchString, AcceptsSupportedBaseISAsAndSetsXLenAndFLen) { EXPECT_EQ(InfoRV64I.getXLen(), 64U); EXPECT_EQ(InfoRV64I.getFLen(), 0U); + auto MaybeRV64E = RISCVISAInfo::parseArchString("rv64e", true); + ASSERT_THAT_EXPECTED(MaybeRV64E, Succeeded()); + RISCVISAInfo &InfoRV64E = **MaybeRV64E; + RISCVISAInfo::OrderedExtensionMap ExtsRV64E = InfoRV64E.getExtensions(); + EXPECT_EQ(ExtsRV64E.size(), 1UL); + EXPECT_TRUE(ExtsRV64E.at("e") == (RISCVExtensionInfo{2, 0})); + EXPECT_EQ(InfoRV64E.getXLen(), 64U); + EXPECT_EQ(InfoRV64E.getFLen(), 0U); + auto MaybeRV64G = RISCVISAInfo::parseArchString("rv64g", true); ASSERT_THAT_EXPECTED(MaybeRV64G, Succeeded()); RISCVISAInfo &InfoRV64G = **MaybeRV64G; From 3d65cd405d64afd86a59c1f58098dfe891841271 Mon Sep 17 00:00:00 2001 From: Yi Kong Date: Thu, 23 Mar 2023 20:29:17 +0800 Subject: [PATCH 113/208] [llvm-objdump] Fix help message for 
--print-imm-hex Commit cc2457ca1bbd changed the default but forgot to update the help message. --- llvm/tools/llvm-objdump/ObjdumpOpts.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/tools/llvm-objdump/ObjdumpOpts.td b/llvm/tools/llvm-objdump/ObjdumpOpts.td index de7f883d24a80..c6627c75157b8 100644 --- a/llvm/tools/llvm-objdump/ObjdumpOpts.td +++ b/llvm/tools/llvm-objdump/ObjdumpOpts.td @@ -145,10 +145,10 @@ def reloc : Flag<["--"], "reloc">, def : Flag<["-"], "r">, Alias, HelpText<"Alias for --reloc">; def print_imm_hex : Flag<["--"], "print-imm-hex">, - HelpText<"Use hex format for immediate values">; + HelpText<"Use hex format for immediate values (default)">; def no_print_imm_hex : Flag<["--"], "no-print-imm-hex">, - HelpText<"Do not use hex format for immediate values (default)">; + HelpText<"Do not use hex format for immediate values">; def : Flag<["--"], "print-imm-hex=false">, Alias; def private_headers : Flag<["--"], "private-headers">, From 7fef15edd4d35d5f2dcaa8cd32d5c8add028dc67 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 23 Mar 2023 12:37:11 +0000 Subject: [PATCH 114/208] Revert rG6aa7cc037f2f95c237c1d82c523f8857fa3a10c3 - "[X86] LowerVectorAllZero - add 512-bit support with AVX512 vptestnmd+kortestw patterns" Reverted - I need to adjust the implementation so we can properly refactor it into a "LowerVectorAllEqual" function --- llvm/lib/Target/X86/X86ISelLowering.cpp | 13 +----- llvm/test/CodeGen/X86/ptest.ll | 15 +++--- llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll | 46 +++++++++++-------- 3 files changed, 39 insertions(+), 35 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index e006388b6e928..e828fe4b9dd15 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -24192,23 +24192,14 @@ static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC, DAG.getConstant(0, DL, IntVT)); } - // Split down to 128/256/512-bit vector. - unsigned TestSize = - Subtarget.useAVX512Regs() ? 512 : (Subtarget.hasAVX() ? 256 : 128); + // Split down to 128/256-bit vector. + unsigned TestSize = Subtarget.hasAVX() ? 256 : 128; while (VT.getSizeInBits() > TestSize) { auto Split = DAG.SplitVector(V, DL); VT = Split.first.getValueType(); V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second); } - bool UseKORTEST = Subtarget.useAVX512Regs(); - if (UseKORTEST && VT.is512BitVector()) { - V = DAG.getBitcast(MVT::v16i32, MaskBits(V)); - V = DAG.getSetCC(DL, MVT::v16i1, V, - getZeroVector(MVT::v16i32, Subtarget, DAG, DL), ISD::SETEQ); - return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V); - } - bool UsePTEST = Subtarget.hasSSE41(); if (UsePTEST) { MVT TestVT = VT.is128BitVector() ? 
MVT::v2i64 : MVT::v4i64; diff --git a/llvm/test/CodeGen/X86/ptest.ll b/llvm/test/CodeGen/X86/ptest.ll index c417c5d15b874..066cbb6193317 100644 --- a/llvm/test/CodeGen/X86/ptest.ll +++ b/llvm/test/CodeGen/X86/ptest.ll @@ -148,8 +148,9 @@ define i32 @veccond512(<16 x i32> %input) { ; ; AVX512-LABEL: veccond512: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vptest %ymm0, %ymm0 ; AVX512-NEXT: je .LBB2_2 ; AVX512-NEXT: # %bb.1: # %if-true-block ; AVX512-NEXT: xorl %eax, %eax @@ -267,9 +268,10 @@ define i32 @vectest512(<16 x i32> %input) { ; ; AVX512-LABEL: vectest512: ; AVX512: # %bb.0: -; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vptest %ymm0, %ymm0 ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -378,8 +380,9 @@ define i32 @vecsel512(<16 x i32> %input, i32 %a, i32 %b) { ; AVX512-LABEL: vecsel512: ; AVX512: # %bb.0: ; AVX512-NEXT: movl %edi, %eax -; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vptest %ymm0, %ymm0 ; AVX512-NEXT: cmovel %esi, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll index 5d921c0aa2c62..fcb0ab6090398 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll @@ -105,8 +105,9 @@ define i1 @test_v8i64(<8 x i64> %a0) { ; ; AVX512-LABEL: test_v8i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vptest %ymm0, %ymm0 ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -168,8 +169,9 @@ define i1 @test_v16i64(<16 x i64> %a0) { ; AVX512-LABEL: test_v16i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vptest %ymm0, %ymm0 ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -296,8 +298,9 @@ define i1 @test_v16i32(<16 x i32> %a0) { ; ; AVX512-LABEL: test_v16i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vptest %ymm0, %ymm0 ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -359,8 +362,9 @@ define i1 @test_v32i32(<32 x i32> %a0) { ; AVX512-LABEL: test_v32i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vptest %ymm0, %ymm0 ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -506,8 +510,9 @@ define i1 @test_v32i16(<32 x i16> %a0) { ; ; AVX512-LABEL: test_v32i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; 
AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vptest %ymm0, %ymm0 ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -569,8 +574,9 @@ define i1 @test_v64i16(<64 x i16> %a0) { ; AVX512-LABEL: test_v64i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vptest %ymm0, %ymm0 ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -735,8 +741,9 @@ define i1 @test_v64i8(<64 x i8> %a0) { ; ; AVX512-LABEL: test_v64i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vptest %ymm0, %ymm0 ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -798,8 +805,9 @@ define i1 @test_v128i8(<128 x i8> %a0) { ; AVX512-LABEL: test_v128i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vptest %ymm0, %ymm0 ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1006,8 +1014,10 @@ define i1 @mask_v128i8(<128 x i8> %a0) { ; AVX512-LABEL: mask_v128i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] +; AVX512-NEXT: vptest %ymm1, %ymm0 ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq From c6e9823724ef6bdfee262289ee34d162db436af0 Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Fri, 3 Jun 2022 10:43:38 +0100 Subject: [PATCH 115/208] [C++20][Modules] Introduce an implementation module. We need to be able to distinguish individual TUs from the same module in cases where TU-local entities either need to be hidden (or, for some cases of ADL in template instantiation, need to be detected as exposures). This creates a module type for the implementation which implicitly imports its primary module interface per C++20: [module.unit/8] 'A module-declaration that contains neither an export-keyword nor a module-partition implicitly imports the primary module interface unit of the module as if by a module-import-declaration. Implementation modules are never serialized (-emit-module-interface for an implementation unit is diagnosed and rejected). 
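To make the distinction concrete, here is a minimal sketch of the two kinds of module unit this patch tells apart; the file names and functions are illustrative assumptions, not taken from the patch itself:

    // M.cppm: primary module interface unit (hypothetical file).
    export module M;
    export int exported_fn();   // exported, usable by importers of M
    int module_linkage_fn();    // not exported, module linkage only

    // M.cpp: implementation unit of the same module (hypothetical file).
    module M;                   // implicitly imports the interface of M
    int exported_fn() { return module_linkage_fn(); }
    int module_linkage_fn() { return 42; }

Per the wording quoted above, the plain `module M;` declaration behaves as if the interface unit had been imported, so both declarations from M.cppm are visible in M.cpp without an explicit import, while only the interface unit can be emitted as a .pcm.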
Differential Revision: https://reviews.llvm.org/D126959 --- clang/include/clang/Basic/Module.h | 28 +++++++-- clang/include/clang/Lex/ModuleMap.h | 12 ++++ clang/include/clang/Sema/Sema.h | 4 ++ clang/lib/AST/Decl.cpp | 1 + clang/lib/CodeGen/CGDeclCXX.cpp | 6 +- clang/lib/CodeGen/CodeGenModule.cpp | 2 + clang/lib/Frontend/FrontendActions.cpp | 2 + clang/lib/Lex/ModuleMap.cpp | 42 ++++++++++--- clang/lib/Sema/SemaDecl.cpp | 20 ++++--- clang/lib/Sema/SemaModule.cpp | 59 +++++++++++-------- clang/lib/Serialization/ASTWriter.cpp | 2 +- .../CXX/module/basic/basic.def.odr/p4.cppm | 10 ++-- .../test/CXX/module/basic/basic.link/p2.cppm | 10 ++-- clang/test/CodeGenCXX/module-intializer.cpp | 8 +-- 14 files changed, 147 insertions(+), 59 deletions(-) diff --git a/clang/include/clang/Basic/Module.h b/clang/include/clang/Basic/Module.h index 387ce4d6e9b17..c0c99eb8b6d62 100644 --- a/clang/include/clang/Basic/Module.h +++ b/clang/include/clang/Basic/Module.h @@ -103,16 +103,22 @@ class alignas(8) Module { /// The location of the module definition. SourceLocation DefinitionLoc; + // FIXME: Consider if reducing the size of this enum (having Partition and + // Named modules only) then representing interface/implementation separately + // is more efficient. enum ModuleKind { /// This is a module that was defined by a module map and built out /// of header files. ModuleMapModule, + /// This is a C++ 20 header unit. + ModuleHeaderUnit, + /// This is a C++20 module interface unit. ModuleInterfaceUnit, - /// This is a C++ 20 header unit. - ModuleHeaderUnit, + /// This is a C++20 module implementation unit. + ModuleImplementationUnit, /// This is a C++ 20 module partition interface. ModulePartitionInterface, @@ -169,9 +175,16 @@ class alignas(8) Module { /// Does this Module scope describe part of the purview of a standard named /// C++ module? bool isModulePurview() const { - return Kind == ModuleInterfaceUnit || Kind == ModulePartitionInterface || - Kind == ModulePartitionImplementation || - Kind == PrivateModuleFragment; + switch (Kind) { + case ModuleInterfaceUnit: + case ModuleImplementationUnit: + case ModulePartitionInterface: + case ModulePartitionImplementation: + case PrivateModuleFragment: + return true; + default: + return false; + } } /// Does this Module scope describe a fragment of the global module within @@ -561,6 +574,11 @@ class alignas(8) Module { Kind == ModulePartitionImplementation; } + /// Is this a module implementation. + bool isModuleImplementation() const { + return Kind == ModuleImplementationUnit; + } + /// Is this module a header unit. bool isHeaderUnit() const { return Kind == ModuleHeaderUnit; } // Is this a C++20 module interface or a partition. diff --git a/clang/include/clang/Lex/ModuleMap.h b/clang/include/clang/Lex/ModuleMap.h index a0ddd13c11bfd..f155c609b06cb 100644 --- a/clang/include/clang/Lex/ModuleMap.h +++ b/clang/include/clang/Lex/ModuleMap.h @@ -560,6 +560,11 @@ class ModuleMap { Module *createPrivateModuleFragmentForInterfaceUnit(Module *Parent, SourceLocation Loc); + /// Create a new C++ module with the specified kind, and reparent any pending + /// global module fragment(s) to it. + Module *createModuleUnitWithKind(SourceLocation Loc, StringRef Name, + Module::ModuleKind Kind); + /// Create a new module for a C++ module interface unit. /// The module must not already exist, and will be configured for the current /// compilation. @@ -569,6 +574,13 @@ class ModuleMap { /// \returns The newly-created module. 
Module *createModuleForInterfaceUnit(SourceLocation Loc, StringRef Name); + /// Create a new module for a C++ module implementation unit. + /// The interface module for this implementation (implicitly imported) must + /// exist and be loaded and present in the modules map. + /// + /// \returns The newly-created module. + Module *createModuleForImplementationUnit(SourceLocation Loc, StringRef Name); + /// Create a C++20 header unit. Module *createHeaderUnit(SourceLocation Loc, StringRef Name, Module::Header H); diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 63ee0f0ed7fb6..277c02ee3f1bd 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -2274,6 +2274,10 @@ class Sema final { }; /// The modules we're currently parsing. llvm::SmallVector ModuleScopes; + + /// For an interface unit, this is the implicitly imported interface unit. + clang::Module *ThePrimaryInterface = nullptr; + /// The explicit global module fragment of the current translation unit. /// The explicit Global Module Fragment, as specified in C++ /// [module.global.frag]. diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 56042e5fd252f..cd786049f914e 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -1600,6 +1600,7 @@ Module *Decl::getOwningModuleForLinkage(bool IgnoreLinkage) const { return nullptr; case Module::ModuleInterfaceUnit: + case Module::ModuleImplementationUnit: case Module::ModulePartitionInterface: case Module::ModulePartitionImplementation: return M; diff --git a/clang/lib/CodeGen/CGDeclCXX.cpp b/clang/lib/CodeGen/CGDeclCXX.cpp index 0d0b5707e605a..9d7284cd0e37d 100644 --- a/clang/lib/CodeGen/CGDeclCXX.cpp +++ b/clang/lib/CodeGen/CGDeclCXX.cpp @@ -880,9 +880,11 @@ CodeGenModule::EmitCXXGlobalInitFunc() { // Include the filename in the symbol name. Including "sub_" matches gcc // and makes sure these symbols appear lexicographically behind the symbols - // with priority emitted above. + // with priority emitted above. Module implementation units behave the same + // way as a non-modular TU with imports. llvm::Function *Fn; - if (CXX20ModuleInits && getContext().getNamedModuleForCodeGen()) { + if (CXX20ModuleInits && getContext().getNamedModuleForCodeGen() && + !getContext().getNamedModuleForCodeGen()->isModuleImplementation()) { SmallString<256> InitFnName; llvm::raw_svector_ostream Out(InitFnName); cast(getCXXABI().getMangleContext()) diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 0e33e9632b3eb..bd1ee2a674abb 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -548,6 +548,8 @@ void CodeGenModule::Release() { GlobalTopLevelStmtBlockInFlight = {nullptr, nullptr}; } + // Module implementations are initialized the same way as a regular TU that + // imports one or more modules. 
if (CXX20ModuleInits && Primary && Primary->isInterfaceOrPartition()) EmitCXXModuleInitFunc(Primary); else diff --git a/clang/lib/Frontend/FrontendActions.cpp b/clang/lib/Frontend/FrontendActions.cpp index 2aae41fe488ff..05d9fc8208b26 100644 --- a/clang/lib/Frontend/FrontendActions.cpp +++ b/clang/lib/Frontend/FrontendActions.cpp @@ -759,6 +759,8 @@ static StringRef ModuleKindName(Module::ModuleKind MK) { return "Module Map Module"; case Module::ModuleInterfaceUnit: return "Interface Unit"; + case Module::ModuleImplementationUnit: + return "Implementation Unit"; case Module::ModulePartitionInterface: return "Partition Interface"; case Module::ModulePartitionImplementation: diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp index 8dead93b03734..f2b2d0b8c69f1 100644 --- a/clang/lib/Lex/ModuleMap.cpp +++ b/clang/lib/Lex/ModuleMap.cpp @@ -888,23 +888,30 @@ ModuleMap::createPrivateModuleFragmentForInterfaceUnit(Module *Parent, return Result; } -Module *ModuleMap::createModuleForInterfaceUnit(SourceLocation Loc, - StringRef Name) { - assert(LangOpts.CurrentModule == Name && "module name mismatch"); - assert(!Modules[Name] && "redefining existing module"); - +Module *ModuleMap::createModuleUnitWithKind(SourceLocation Loc, StringRef Name, + Module::ModuleKind Kind) { auto *Result = new Module(Name, Loc, nullptr, /*IsFramework*/ false, /*IsExplicit*/ false, NumCreatedModules++); - Result->Kind = Module::ModuleInterfaceUnit; - Modules[Name] = SourceModule = Result; + Result->Kind = Kind; - // Reparent the current global module fragment as a submodule of this module. + // Reparent any current global module fragment as a submodule of this module. for (auto &Submodule : PendingSubmodules) { Submodule->setParent(Result); Submodule.release(); // now owned by parent } PendingSubmodules.clear(); + return Result; +} + +Module *ModuleMap::createModuleForInterfaceUnit(SourceLocation Loc, + StringRef Name) { + assert(LangOpts.CurrentModule == Name && "module name mismatch"); + assert(!Modules[Name] && "redefining existing module"); + + auto *Result = + createModuleUnitWithKind(Loc, Name, Module::ModuleInterfaceUnit); + Modules[Name] = SourceModule = Result; // Mark the main source file as being within the newly-created module so that // declarations and macros are properly visibility-restricted to it. @@ -915,6 +922,25 @@ Module *ModuleMap::createModuleForInterfaceUnit(SourceLocation Loc, return Result; } +Module *ModuleMap::createModuleForImplementationUnit(SourceLocation Loc, + StringRef Name) { + assert(LangOpts.CurrentModule == Name && "module name mismatch"); + // The interface for this implementation must exist and be loaded. + assert(Modules[Name] && Modules[Name]->Kind == Module::ModuleInterfaceUnit && + "creating implementation module without an interface"); + + auto *Result = + createModuleUnitWithKind(Loc, Name, Module::ModuleImplementationUnit); + SourceModule = Result; + + // Mark the main source file as being within the newly-created module so that + // declarations and macros are properly visibility-restricted to it. 
+ auto *MainFile = SourceMgr.getFileEntryForID(SourceMgr.getMainFileID()); + assert(MainFile && "no input file for module implementation"); + + return Result; +} + Module *ModuleMap::createHeaderUnit(SourceLocation Loc, StringRef Name, Module::Header H) { assert(LangOpts.CurrentModule == Name && "module name mismatch"); diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 64034393344f0..dd001dba2b912 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -1661,13 +1661,19 @@ bool Sema::CheckRedeclarationModuleOwnership(NamedDecl *New, NamedDecl *Old) { if (NewM == OldM) return false; - // Partitions are part of the module, but a partition could import another - // module, so verify that the PMIs agree. - if (NewM && OldM && - (NewM->isModulePartition() || OldM->isModulePartition()) && - NewM->getPrimaryModuleInterfaceName() == - OldM->getPrimaryModuleInterfaceName()) - return false; + if (NewM && OldM) { + // A module implementation unit has visibility of the decls in its + // implicitly imported interface. + if (NewM->isModuleImplementation() && OldM == ThePrimaryInterface) + return false; + + // Partitions are part of the module, but a partition could import another + // module, so verify that the PMIs agree. + if ((NewM->isModulePartition() || OldM->isModulePartition()) && + NewM->getPrimaryModuleInterfaceName() == + OldM->getPrimaryModuleInterfaceName()) + return false; + } bool NewIsModuleInterface = NewM && NewM->isModulePurview(); bool OldIsModuleInterface = OldM && OldM->isModulePurview(); diff --git a/clang/lib/Sema/SemaModule.cpp b/clang/lib/Sema/SemaModule.cpp index 8c120d278d634..c02b9d2ac25b0 100644 --- a/clang/lib/Sema/SemaModule.cpp +++ b/clang/lib/Sema/SemaModule.cpp @@ -298,8 +298,8 @@ Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc, const_cast(getLangOpts()).CurrentModule = ModuleName; auto &Map = PP.getHeaderSearchInfo().getModuleMap(); - Module *Mod; - + Module *Mod; // The module we are creating. + Module *Interface = nullptr; // The interface for an implementation. switch (MDK) { case ModuleDeclKind::Interface: case ModuleDeclKind::PartitionInterface: { @@ -336,18 +336,19 @@ Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc, // we're building if `LangOpts.CurrentModule` equals to 'ModuleName'. // Change the value for `LangOpts.CurrentModule` temporarily to make the // module loader work properly. - const_cast(getLangOpts()).CurrentModule = ""; - Mod = getModuleLoader().loadModule(ModuleLoc, {ModuleNameLoc}, - Module::AllVisible, - /*IsInclusionDirective=*/false); + const_cast(getLangOpts()).CurrentModule = ""; + Interface = getModuleLoader().loadModule(ModuleLoc, {ModuleNameLoc}, + Module::AllVisible, + /*IsInclusionDirective=*/false); const_cast(getLangOpts()).CurrentModule = ModuleName; - if (!Mod) { + if (!Interface) { Diag(ModuleLoc, diag::err_module_not_defined) << ModuleName; // Create an empty module interface unit for error recovery. Mod = Map.createModuleForInterfaceUnit(ModuleLoc, ModuleName); + } else { + Mod = Map.createModuleForImplementationUnit(ModuleLoc, ModuleName); } - } break; case ModuleDeclKind::PartitionImplementation: @@ -386,19 +387,31 @@ Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc, // statements, so imports are allowed. ImportState = ModuleImportState::ImportAllowed; - // For an implementation, We already made an implicit import (its interface). - // Make and return the import decl to be added to the current TU. 
- if (MDK == ModuleDeclKind::Implementation) { - // Make the import decl for the interface. - ImportDecl *Import = - ImportDecl::Create(Context, CurContext, ModuleLoc, Mod, Path[0].second); - // and return it to be added. + getASTContext().setNamedModuleForCodeGen(Mod); + + // We already potentially made an implicit import (in the case of a module + // implementation unit importing its interface). Make this module visible + // and return the import decl to be added to the current TU. + if (Interface) { + + VisibleModules.setVisible(Interface, ModuleLoc); + + // Make the import decl for the interface in the impl module. + ImportDecl *Import = ImportDecl::Create(Context, CurContext, ModuleLoc, + Interface, Path[0].second); + CurContext->addDecl(Import); + + // Sequence initialization of the imported module before that of the current + // module, if any. + Context.addModuleInitializer(ModuleScopes.back().Module, Import); + Mod->Imports.insert(Interface); // As if we imported it. + // Also save this as a shortcut to checking for decls in the interface + ThePrimaryInterface = Interface; + // If we made an implicit import of the module interface, then return the + // imported module decl. return ConvertDeclToDeclGroup(Import); } - getASTContext().setNamedModuleForCodeGen(Mod); - - // FIXME: Create a ModuleDecl. return nullptr; } @@ -424,19 +437,17 @@ Sema::ActOnPrivateModuleFragmentDecl(SourceLocation ModuleLoc, Diag(ModuleScopes.back().BeginLoc, diag::note_previous_definition); return nullptr; - case Module::ModuleInterfaceUnit: - break; - } - - if (!ModuleScopes.back().ModuleInterface) { + case Module::ModuleImplementationUnit: Diag(PrivateLoc, diag::err_private_module_fragment_not_module_interface); Diag(ModuleScopes.back().BeginLoc, diag::note_not_module_interface_add_export) << FixItHint::CreateInsertion(ModuleScopes.back().BeginLoc, "export "); return nullptr; + + case Module::ModuleInterfaceUnit: + break; } - // FIXME: Check this isn't a module interface partition. // FIXME: Check that this translation unit does not import any partitions; // such imports would violate [basic.link]/2's "shall be the only module unit" // restriction. 
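As a rough illustration of the visibility rule the SemaModule.cpp changes above implement (and that the basic.link/p2.cppm update later in this patch exercises), using illustrative names rather than the test's own:

    // Interface unit.
    export module M;
    void module_fn() {}           // module linkage
    static void internal_fn() {}  // internal linkage, TU-local

    // Implementation unit.
    module M;
    void use() {
      module_fn();    // OK: found through the implicit import of the interface
      internal_fn();  // now diagnosed, matching the expected-error added to p2.cppm
    }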
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 94160409c5f53..3e40812a9a0ba 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -2719,7 +2719,7 @@ void ASTWriter::WriteSubmodules(Module *WritingModule) { Abbrev->Add(BitCodeAbbrevOp(SUBMODULE_DEFINITION)); Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // ID Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Parent - Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); // Kind + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // Kind Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // IsFramework Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // IsExplicit Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // IsSystem diff --git a/clang/test/CXX/module/basic/basic.def.odr/p4.cppm b/clang/test/CXX/module/basic/basic.def.odr/p4.cppm index 1542e532c635a..487dbdef283ee 100644 --- a/clang/test/CXX/module/basic/basic.def.odr/p4.cppm +++ b/clang/test/CXX/module/basic/basic.def.odr/p4.cppm @@ -143,9 +143,6 @@ void use() { (void)&inline_var_exported; (void)&const_var_exported; - // CHECK: define {{.*}}@_ZL26used_static_module_linkagev - used_static_module_linkage(); - // CHECK: define linkonce_odr {{.*}}@_ZW6Module26used_inline_module_linkagev used_inline_module_linkage(); @@ -154,8 +151,12 @@ void use() { (void)&extern_var_module_linkage; (void)&inline_var_module_linkage; + + // FIXME: Issue #61427 Internal-linkage declarations in the interface TU + // should not be not visible here. (void)&static_var_module_linkage; // FIXME: Should not be visible here. - (void)&const_var_module_linkage; + + (void)&const_var_module_linkage; // FIXME: will be visible after P2788R0 } //--- user.cpp @@ -176,5 +177,6 @@ void use() { (void)&inline_var_exported; (void)&const_var_exported; + // Internal-linkage declarations are not visible here. // Module-linkage declarations are not visible here. } diff --git a/clang/test/CXX/module/basic/basic.link/p2.cppm b/clang/test/CXX/module/basic/basic.link/p2.cppm index e04412ea08d4a..19761fb3359ce 100644 --- a/clang/test/CXX/module/basic/basic.link/p2.cppm +++ b/clang/test/CXX/module/basic/basic.link/p2.cppm @@ -39,19 +39,21 @@ void use() { } //--- M.cpp -// expected-no-diagnostics + module M; -// FIXME: Use of internal linkage entities should be rejected. void use_from_module_impl() { external_linkage_fn(); module_linkage_fn(); - internal_linkage_fn(); + internal_linkage_fn(); // expected-error {{no matching function for call to 'internal_linkage_fn'}} (void)external_linkage_class{}; (void)module_linkage_class{}; - (void)internal_linkage_class{}; (void)external_linkage_var; (void)module_linkage_var; + + // FIXME: Issue #61427 Internal-linkage declarations in the interface TU + // should not be not visible here. 
+ (void)internal_linkage_class{}; (void)internal_linkage_var; } diff --git a/clang/test/CodeGenCXX/module-intializer.cpp b/clang/test/CodeGenCXX/module-intializer.cpp index e5149401b467a..d365d180ac59d 100644 --- a/clang/test/CodeGenCXX/module-intializer.cpp +++ b/clang/test/CodeGenCXX/module-intializer.cpp @@ -18,17 +18,17 @@ // RUN: -emit-llvm -o - | FileCheck %s --check-prefix=CHECK-P // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 M.cpp \ -// RUN: -fmodule-file=N.pcm -fmodule-file=O.pcm -fmodule-file=M-part.pcm \ +// RUN: -fmodule-file=N=N.pcm -fmodule-file=O=O.pcm -fmodule-file=M:Part=M-part.pcm \ // RUN: -emit-module-interface -o M.pcm // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 M.pcm -S -emit-llvm \ // RUN: -o - | FileCheck %s --check-prefix=CHECK-M // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 useM.cpp \ -// RUN: -fmodule-file=M.pcm -S -emit-llvm -o - \ +// RUN: -fmodule-file=M=M.pcm -S -emit-llvm -o - \ // RUN: | FileCheck %s --check-prefix=CHECK-USE // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 M-impl.cpp \ -// RUN: -fmodule-file=M.pcm -S -emit-llvm -o - \ +// RUN: -fmodule-file=M=M.pcm -S -emit-llvm -o - \ // RUN: | FileCheck %s --check-prefix=CHECK-IMPL // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 N.cpp -S -emit-llvm \ @@ -41,7 +41,7 @@ // RUN: -o - | FileCheck %s --check-prefix=CHECK-P // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 M.cpp \ -// RUN: -fmodule-file=N.pcm -fmodule-file=O.pcm -fmodule-file=M-part.pcm \ +// RUN: -fmodule-file=N.pcm -fmodule-file=O=O.pcm -fmodule-file=M:Part=M-part.pcm \ // RUN: -S -emit-llvm -o - | FileCheck %s --check-prefix=CHECK-M //--- N-h.h From e54cdd058e223bd62840e901b8b462c011d2fae5 Mon Sep 17 00:00:00 2001 From: Job Noorman Date: Thu, 23 Mar 2023 12:48:03 +0000 Subject: [PATCH 116/208] [RISCV][clang][test] Fix missed test c39dd7c1db97fa367cb6282067b74cd8e55ef09a missed the appropriate change to clang/test/Driver/riscv-arch.c. --- clang/test/Driver/riscv-arch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/test/Driver/riscv-arch.c b/clang/test/Driver/riscv-arch.c index cbc1464cbcd6f..b13da106df778 100644 --- a/clang/test/Driver/riscv-arch.c +++ b/clang/test/Driver/riscv-arch.c @@ -206,7 +206,7 @@ // RUN: %clang --target=riscv32-unknown-elf -march=unknown -### %s \ // RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32-STR %s // RV32-STR: error: invalid arch name 'unknown', -// RV32-STR: string must begin with rv32{i,e,g} or rv64{i,g} +// RV32-STR: string must begin with rv32{i,e,g} or rv64{i,e,g} // RUN: %clang --target=riscv32-unknown-elf -march=rv32q -### %s \ // RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32-LETTER %s From d0e2a42853b19d415b84c0dab94e800081e4adc6 Mon Sep 17 00:00:00 2001 From: Job Noorman Date: Thu, 23 Mar 2023 13:01:22 +0000 Subject: [PATCH 117/208] [RISCV][test] Fix another missed test change from RV64E patch c39dd7c1db97fa367cb6282067b74cd8e55ef09a missed a needed change to the llvm-objdump test.
--- llvm/test/tools/llvm-objdump/ELF/RISCV/riscv-attributes.s | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/tools/llvm-objdump/ELF/RISCV/riscv-attributes.s b/llvm/test/tools/llvm-objdump/ELF/RISCV/riscv-attributes.s index 7c41b63dcbf86..d15b675450a90 100644 --- a/llvm/test/tools/llvm-objdump/ELF/RISCV/riscv-attributes.s +++ b/llvm/test/tools/llvm-objdump/ELF/RISCV/riscv-attributes.s @@ -31,7 +31,7 @@ vsetvli a3, a2, e8, m8, tu, mu .Lend: #--- invalid_arch.s -# INVALID: string must begin with rv32{i,e,g} or rv64{i,g} +# INVALID: string must begin with rv32{i,e,g} or rv64{i,e,g} nop .section .riscv.attributes,"",@0x70000003 From 4738c5f0832f283f8822b7a5b8b8491a20425346 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 23 Mar 2023 13:07:29 +0000 Subject: [PATCH 118/208] [X86] LowerVectorAllZero - early out for masked v2i64 cases without PTEST. NFC. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index e828fe4b9dd15..74e2a2b6fdc10 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -24192,6 +24192,12 @@ static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC, DAG.getConstant(0, DL, IntVT)); } + // Without PTEST, a masked v2i64 or-reduction is not faster than + // scalarization. + bool UsePTEST = Subtarget.hasSSE41(); + if (!UsePTEST && !Mask.isAllOnes() && VT.getScalarSizeInBits() > 32) + return SDValue(); + // Split down to 128/256-bit vector. unsigned TestSize = Subtarget.hasAVX() ? 256 : 128; while (VT.getSizeInBits() > TestSize) { @@ -24200,18 +24206,12 @@ static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC, V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second); } - bool UsePTEST = Subtarget.hasSSE41(); if (UsePTEST) { MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; V = DAG.getBitcast(TestVT, MaskBits(V)); return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V); } - // Without PTEST, a masked v2i64 or-reduction is not faster than - // scalarization. - if (!Mask.isAllOnes() && VT.getScalarSizeInBits() > 32) - return SDValue(); - V = DAG.getBitcast(MVT::v16i8, MaskBits(V)); V = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, V, getZeroVector(MVT::v16i8, Subtarget, DAG, DL)); From 3f2dbcc27dfaa7ab53a0318f2fc732f5ce144222 Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Thu, 23 Mar 2023 08:47:55 +0900 Subject: [PATCH 119/208] [Bazel] Rework `//llvm:llvm-tblgen` and `//llvm/unittests:tablegen_tests` `llvm/utils/TableGen/GlobalISel` should be exported. FYI, after D144351, `tablegen_tests` behaved the same as `llvm-tblgen -print-records`. It succeeded because stdin is `/dev/null`.
--- .../llvm-project-overlay/llvm/BUILD.bazel | 35 ++++++++++--------- .../llvm/unittests/BUILD.bazel | 2 +- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index 55064fba0bf88..eff9752b785f5 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -569,28 +569,18 @@ cc_library( ) cc_library( - name = "tblgen", - alwayslink = True, + name = "LLVMTableGenGlobalISel", srcs = glob([ - "utils/TableGen/*.cpp", - "utils/TableGen/*.inc", "utils/TableGen/GlobalISel/*.cpp", - - # Some tablegen sources include headers from MC, so these have to be - # listed here. MC uses headers produced by tablegen, so it cannot be a - # regular dependency. - "include/llvm/MC/*.h", - + ]) + [ + "utils/TableGen/CodeGenInstruction.h", + ], + hdrs = glob([ # We have to include these headers here as well as in the `hdrs` below # to allow the `.cpp` files to use file-relative-inclusion to find # them, even though consumers of this library use inclusion relative to # `utils/TableGen` with the `strip_includes_prefix` of this library. # This mixture appears to be incompatible with header modules. - "utils/TableGen/*.h", - "utils/TableGen/GlobalISel/*.h", - ]), - hdrs = glob([ - "utils/TableGen/*.h", "utils/TableGen/GlobalISel/*.h", ]), copts = llvm_copts, @@ -605,10 +595,23 @@ cc_library( cc_binary( name = "llvm-tblgen", + srcs = glob([ + "utils/TableGen/*.cpp", + "utils/TableGen/*.inc", + "utils/TableGen/*.h", + + # Some tablegen sources include headers from MC, so these have to be + # listed here. MC uses headers produced by tablegen, so it cannot be a + # regular dependency. + "include/llvm/MC/*.h", + ]), copts = llvm_copts, stamp = 0, deps = [ - ":tblgen", + ":LLVMTableGenGlobalISel", + ":Support", + ":TableGen", + ":config", ], ) diff --git a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel index a37041af5e8dc..4996a0ac93ab7 100644 --- a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel @@ -675,7 +675,7 @@ cc_test( ":automata_tables_gen", "//llvm:Support", "//llvm:TableGen", - "//llvm:tblgen", + "//llvm:LLVMTableGenGlobalISel", "//third-party/unittest:gmock", "//third-party/unittest:gtest", "//third-party/unittest:gtest_main", From a7c574d0c10ff686cf06d50010d759eaa5a4747b Mon Sep 17 00:00:00 2001 From: Johannes de Fine Licht Date: Thu, 23 Mar 2023 14:22:15 +0100 Subject: [PATCH 120/208] [MLIR][LLVM] Move the LLVM inliner interface into a separate file. A fully fledged LLVM inliner will require a lot of logic. Since `LLVMDialect.cpp` is large enough as it is, preemptively outline the inlining logic into a separate `.cpp` file. This will also allow us to add a `DEBUG_TYPE` for debugging the inliner. The name `LLVMInlining` was chosen over `LLVMInlinerInterface` to keep the option open for exposing inlining functionality even when not invoked through the `DialectInlinerInterface`. 
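For context, the kind of debugging hook this split makes room for would look roughly like the sketch below; the `DEBUG_TYPE` string and the helper function are assumptions for illustration, not something this patch adds:

    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/Debug.h"
    #include "llvm/Support/raw_ostream.h"

    #define DEBUG_TYPE "llvm-inliner"

    // Printed only in builds with assertions, when running with
    // -debug-only=llvm-inliner.
    static void notifyInlined(llvm::StringRef calleeName) {
      LLVM_DEBUG(llvm::dbgs() << "inlining call to " << calleeName << "\n");
    }

Since a `DEBUG_TYPE` definition applies to the whole translation unit, giving the inliner its own .cpp file is what makes a dedicated debug type practical.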
Depends on D146616 Reviewed By: gysit Differential Revision: https://reviews.llvm.org/D146628 --- mlir/lib/Dialect/LLVMIR/CMakeLists.txt | 1 + mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 237 +----------------- mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp | 252 ++++++++++++++++++++ mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.h | 33 +++ 4 files changed, 289 insertions(+), 234 deletions(-) create mode 100644 mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp create mode 100644 mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.h diff --git a/mlir/lib/Dialect/LLVMIR/CMakeLists.txt b/mlir/lib/Dialect/LLVMIR/CMakeLists.txt index ebfe0258e0793..7e631e7ddc802 100644 --- a/mlir/lib/Dialect/LLVMIR/CMakeLists.txt +++ b/mlir/lib/Dialect/LLVMIR/CMakeLists.txt @@ -4,6 +4,7 @@ add_mlir_dialect_library(MLIRLLVMDialect IR/FunctionCallUtils.cpp IR/LLVMAttrs.cpp IR/LLVMDialect.cpp + IR/LLVMInlining.cpp IR/LLVMInterfaces.cpp IR/LLVMTypes.cpp IR/LLVMTypeSyntax.cpp diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index ca439ab8cc15e..428f50f674b26 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// #include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "LLVMInlining.h" #include "TypeDetail.h" #include "mlir/Dialect/LLVMIR/LLVMAttrs.h" #include "mlir/Dialect/LLVMIR/LLVMInterfaces.h" @@ -22,7 +23,6 @@ #include "mlir/IR/FunctionImplementation.h" #include "mlir/IR/MLIRContext.h" #include "mlir/IR/Matchers.h" -#include "mlir/Transforms/InliningUtils.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/TypeSwitch.h" @@ -2777,237 +2777,6 @@ struct LLVMOpAsmDialectInterface : public OpAsmDialectInterface { }; } // namespace -//===----------------------------------------------------------------------===// -// DialectInlinerInterface -//===----------------------------------------------------------------------===// - -/// Check whether the given alloca is an input to a lifetime intrinsic, -/// optionally passing through one or more casts on the way. This is not -/// transitive through block arguments. -static bool hasLifetimeMarkers(LLVM::AllocaOp allocaOp) { - SmallVector stack(allocaOp->getUsers().begin(), - allocaOp->getUsers().end()); - while (!stack.empty()) { - Operation *op = stack.pop_back_val(); - if (isa(op)) - return true; - if (isa(op)) - stack.append(op->getUsers().begin(), op->getUsers().end()); - } - return false; -} - -/// Move all alloca operations with a constant size in the former entry block of -/// the newly inlined callee into the entry block of the caller, and insert -/// lifetime intrinsics that limit their scope to the inlined blocks. -static void moveConstantAllocasToEntryBlock( - iterator_range inlinedBlocks) { - Block *calleeEntryBlock = &(*inlinedBlocks.begin()); - Block *callerEntryBlock = &(*calleeEntryBlock->getParent()->begin()); - if (calleeEntryBlock == callerEntryBlock) - // Nothing to do. - return; - SmallVector> allocasToMove; - bool shouldInsertLifetimes = false; - // Conservatively only move alloca operations that are part of the entry block - // and do not inspect nested regions, since they may execute conditionally or - // have other unknown semantics. 
- for (auto allocaOp : calleeEntryBlock->getOps()) { - IntegerAttr arraySize; - if (!matchPattern(allocaOp.getArraySize(), m_Constant(&arraySize))) - continue; - bool shouldInsertLifetime = - arraySize.getValue() != 0 && !hasLifetimeMarkers(allocaOp); - shouldInsertLifetimes |= shouldInsertLifetime; - allocasToMove.emplace_back(allocaOp, arraySize, shouldInsertLifetime); - } - if (allocasToMove.empty()) - return; - OpBuilder builder(callerEntryBlock, callerEntryBlock->begin()); - for (auto &[allocaOp, arraySize, shouldInsertLifetime] : allocasToMove) { - auto newConstant = builder.create( - allocaOp->getLoc(), allocaOp.getArraySize().getType(), arraySize); - // Insert a lifetime start intrinsic where the alloca was before moving it. - if (shouldInsertLifetime) { - OpBuilder::InsertionGuard insertionGuard(builder); - builder.setInsertionPoint(allocaOp); - builder.create( - allocaOp.getLoc(), arraySize.getValue().getLimitedValue(), - allocaOp.getResult()); - } - allocaOp->moveAfter(newConstant); - allocaOp.getArraySizeMutable().assign(newConstant.getResult()); - } - if (!shouldInsertLifetimes) - return; - // Insert a lifetime end intrinsic before each return in the callee function. - for (Block &block : inlinedBlocks) { - if (!block.getTerminator()->hasTrait()) - continue; - builder.setInsertionPoint(block.getTerminator()); - for (auto &[allocaOp, arraySize, shouldInsertLifetime] : allocasToMove) { - if (!shouldInsertLifetime) - continue; - builder.create( - allocaOp.getLoc(), arraySize.getValue().getLimitedValue(), - allocaOp.getResult()); - } - } -} - -static Value handleByValArgument(OpBuilder &builder, Operation *callable, - Value argument, - NamedAttribute byValAttribute) { - auto func = cast(callable); - LLVM::MemoryEffectsAttr memoryEffects = func.getMemoryAttr(); - // If there is no memory effects attribute, assume that the function is - // not read-only. - bool isReadOnly = memoryEffects && - memoryEffects.getArgMem() != ModRefInfo::ModRef && - memoryEffects.getArgMem() != ModRefInfo::Mod; - if (isReadOnly) - return argument; - // Resolve the pointee type and its size. - auto ptrType = cast(argument.getType()); - Type elementType = cast(byValAttribute.getValue()).getValue(); - unsigned int typeSize = - DataLayout(callable->getParentOfType()) - .getTypeSize(elementType); - // Allocate the new value on the stack. - Value one = builder.create( - func.getLoc(), builder.getI64Type(), builder.getI64IntegerAttr(1)); - Value allocaOp = - builder.create(func.getLoc(), ptrType, elementType, one); - // Copy the pointee to the newly allocated value. 
- Value copySize = builder.create( - func.getLoc(), builder.getI64Type(), builder.getI64IntegerAttr(typeSize)); - Value isVolatile = builder.create( - func.getLoc(), builder.getI1Type(), builder.getBoolAttr(false)); - builder.create(func.getLoc(), allocaOp, argument, copySize, - isVolatile); - return allocaOp; -} - -namespace { -struct LLVMInlinerInterface : public DialectInlinerInterface { - using DialectInlinerInterface::DialectInlinerInterface; - - bool isLegalToInline(Operation *call, Operation *callable, - bool wouldBeCloned) const final { - if (!wouldBeCloned) - return false; - auto callOp = dyn_cast(call); - auto funcOp = dyn_cast(callable); - if (!callOp || !funcOp) - return false; - if (auto attrs = funcOp.getArgAttrs()) { - for (Attribute attr : *attrs) { - auto attrDict = cast(attr); - for (NamedAttribute attr : attrDict) { - if (attr.getName() == LLVMDialect::getByValAttrName()) - continue; - // TODO: Handle all argument attributes; - return false; - } - } - } - // TODO: Handle result attributes; - if (funcOp.getResAttrs()) - return false; - // TODO: Handle exceptions. - if (funcOp.getPersonality()) - return false; - if (funcOp.getPassthrough()) { - // TODO: Used attributes should not be passthrough. - DenseSet disallowed( - {StringAttr::get(funcOp->getContext(), "noduplicate"), - StringAttr::get(funcOp->getContext(), "noinline"), - StringAttr::get(funcOp->getContext(), "optnone"), - StringAttr::get(funcOp->getContext(), "presplitcoroutine"), - StringAttr::get(funcOp->getContext(), "returns_twice"), - StringAttr::get(funcOp->getContext(), "strictfp")}); - if (llvm::any_of(*funcOp.getPassthrough(), [&](Attribute attr) { - auto stringAttr = dyn_cast(attr); - if (!stringAttr) - return false; - return disallowed.contains(stringAttr); - })) - return false; - } - return true; - } - - bool isLegalToInline(Region *, Region *, bool, IRMapping &) const final { - return true; - } - - /// Conservative allowlist of operations supported so far. - bool isLegalToInline(Operation *op, Region *, bool, IRMapping &) const final { - if (isPure(op)) - return true; - // Some attributes on memory operations require handling during - // inlining. Since this is not yet implemented, refuse to inline memory - // operations that have any of these attributes. - if (auto iface = dyn_cast(op)) - if (iface.getAliasScopesOrNull() || iface.getNoAliasScopesOrNull()) - return false; - if (auto iface = dyn_cast(op)) - if (iface.getAccessGroupsOrNull()) - return false; - return isa(op); - } - - /// Handle the given inlined return by replacing it with a branch. This - /// overload is called when the inlined region has more than one block. - void handleTerminator(Operation *op, Block *newDest) const final { - // Only return needs to be handled here. - auto returnOp = dyn_cast(op); - if (!returnOp) - return; - - // Replace the return with a branch to the dest. - OpBuilder builder(op); - builder.create(op->getLoc(), returnOp.getOperands(), newDest); - op->erase(); - } - - /// Handle the given inlined return by replacing the uses of the call with the - /// operands of the return. This overload is called when the inlined region - /// only contains one block. - void handleTerminator(Operation *op, - ArrayRef valuesToRepl) const final { - // Return will be the only terminator present. - auto returnOp = cast(op); - - // Replace the values directly with the return operands. 
- assert(returnOp.getNumOperands() == valuesToRepl.size()); - for (const auto &[dst, src] : - llvm::zip(valuesToRepl, returnOp.getOperands())) - dst.replaceAllUsesWith(src); - } - - Value handleArgument(OpBuilder &builder, Operation *call, Operation *callable, - Value argument, Type targetType, - DictionaryAttr argumentAttrs) const final { - if (auto attr = argumentAttrs.getNamed(LLVMDialect::getByValAttrName())) - return handleByValArgument(builder, callable, argument, *attr); - return argument; - } - - void processInlinedCallBlocks( - Operation *call, - iterator_range inlinedBlocks) const override { - // Alloca operations with a constant size that were in the entry block of - // the callee should be moved to the entry block of the caller, as this will - // fold into prologue/epilogue code during code generation. - // This is not implemented as a standalone pattern because we need to know - // which newly inlined block was previously the entry block of the callee. - moveConstantAllocasToEntryBlock(inlinedBlocks); - } -}; -} // end anonymous namespace - //===----------------------------------------------------------------------===// // LLVMDialect initialization, type parsing, and registration. //===----------------------------------------------------------------------===// @@ -3037,9 +2806,9 @@ void LLVMDialect::initialize() { // Support unknown operations because not all LLVM operations are registered. allowUnknownOperations(); // clang-format off - addInterfaces(); + addInterfaces(); // clang-format on + detail::addLLVMInlinerInterface(this); } #define GET_OP_CLASSES diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp new file mode 100644 index 0000000000000..8a399b9a5d030 --- /dev/null +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp @@ -0,0 +1,252 @@ +//===- LLVMInlining.cpp - LLVM inlining interface and logic -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Logic for inlining LLVM functions and the definition of the +// LLVMInliningInterface. +// +//===----------------------------------------------------------------------===// + +#include "LLVMInlining.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/IR/Matchers.h" +#include "mlir/Transforms/InliningUtils.h" + +using namespace mlir; + +/// Check whether the given alloca is an input to a lifetime intrinsic, +/// optionally passing through one or more casts on the way. This is not +/// transitive through block arguments. +static bool hasLifetimeMarkers(LLVM::AllocaOp allocaOp) { + SmallVector stack(allocaOp->getUsers().begin(), + allocaOp->getUsers().end()); + while (!stack.empty()) { + Operation *op = stack.pop_back_val(); + if (isa(op)) + return true; + if (isa(op)) + stack.append(op->getUsers().begin(), op->getUsers().end()); + } + return false; +} + +/// Move all alloca operations with a constant size in the former entry block of +/// the newly inlined callee into the entry block of the caller, and insert +/// lifetime intrinsics that limit their scope to the inlined blocks. 
+static void moveConstantAllocasToEntryBlock( + iterator_range inlinedBlocks) { + Block *calleeEntryBlock = &(*inlinedBlocks.begin()); + Block *callerEntryBlock = &(*calleeEntryBlock->getParent()->begin()); + if (calleeEntryBlock == callerEntryBlock) + // Nothing to do. + return; + SmallVector> allocasToMove; + bool shouldInsertLifetimes = false; + // Conservatively only move alloca operations that are part of the entry block + // and do not inspect nested regions, since they may execute conditionally or + // have other unknown semantics. + for (auto allocaOp : calleeEntryBlock->getOps()) { + IntegerAttr arraySize; + if (!matchPattern(allocaOp.getArraySize(), m_Constant(&arraySize))) + continue; + bool shouldInsertLifetime = + arraySize.getValue() != 0 && !hasLifetimeMarkers(allocaOp); + shouldInsertLifetimes |= shouldInsertLifetime; + allocasToMove.emplace_back(allocaOp, arraySize, shouldInsertLifetime); + } + if (allocasToMove.empty()) + return; + OpBuilder builder(callerEntryBlock, callerEntryBlock->begin()); + for (auto &[allocaOp, arraySize, shouldInsertLifetime] : allocasToMove) { + auto newConstant = builder.create( + allocaOp->getLoc(), allocaOp.getArraySize().getType(), arraySize); + // Insert a lifetime start intrinsic where the alloca was before moving it. + if (shouldInsertLifetime) { + OpBuilder::InsertionGuard insertionGuard(builder); + builder.setInsertionPoint(allocaOp); + builder.create( + allocaOp.getLoc(), arraySize.getValue().getLimitedValue(), + allocaOp.getResult()); + } + allocaOp->moveAfter(newConstant); + allocaOp.getArraySizeMutable().assign(newConstant.getResult()); + } + if (!shouldInsertLifetimes) + return; + // Insert a lifetime end intrinsic before each return in the callee function. + for (Block &block : inlinedBlocks) { + if (!block.getTerminator()->hasTrait()) + continue; + builder.setInsertionPoint(block.getTerminator()); + for (auto &[allocaOp, arraySize, shouldInsertLifetime] : allocasToMove) { + if (!shouldInsertLifetime) + continue; + builder.create( + allocaOp.getLoc(), arraySize.getValue().getLimitedValue(), + allocaOp.getResult()); + } + } +} + +static Value handleByValArgument(OpBuilder &builder, Operation *callable, + Value argument, + NamedAttribute byValAttribute) { + auto func = cast(callable); + LLVM::MemoryEffectsAttr memoryEffects = func.getMemoryAttr(); + // If there is no memory effects attribute, assume that the function is + // not read-only. + bool isReadOnly = memoryEffects && + memoryEffects.getArgMem() != LLVM::ModRefInfo::ModRef && + memoryEffects.getArgMem() != LLVM::ModRefInfo::Mod; + if (isReadOnly) + return argument; + // Resolve the pointee type and its size. + auto ptrType = cast(argument.getType()); + Type elementType = cast(byValAttribute.getValue()).getValue(); + unsigned int typeSize = + DataLayout(callable->getParentOfType()) + .getTypeSize(elementType); + // Allocate the new value on the stack. + Value one = builder.create( + func.getLoc(), builder.getI64Type(), builder.getI64IntegerAttr(1)); + Value allocaOp = + builder.create(func.getLoc(), ptrType, elementType, one); + // Copy the pointee to the newly allocated value. 
+ Value copySize = builder.create( + func.getLoc(), builder.getI64Type(), builder.getI64IntegerAttr(typeSize)); + Value isVolatile = builder.create( + func.getLoc(), builder.getI1Type(), builder.getBoolAttr(false)); + builder.create(func.getLoc(), allocaOp, argument, copySize, + isVolatile); + return allocaOp; +} + +namespace { +struct LLVMInlinerInterface : public DialectInlinerInterface { + using DialectInlinerInterface::DialectInlinerInterface; + + bool isLegalToInline(Operation *call, Operation *callable, + bool wouldBeCloned) const final { + if (!wouldBeCloned) + return false; + auto callOp = dyn_cast(call); + auto funcOp = dyn_cast(callable); + if (!callOp || !funcOp) + return false; + if (auto attrs = funcOp.getArgAttrs()) { + for (Attribute attr : *attrs) { + auto attrDict = cast(attr); + for (NamedAttribute attr : attrDict) { + if (attr.getName() == LLVM::LLVMDialect::getByValAttrName()) + continue; + // TODO: Handle all argument attributes; + return false; + } + } + } + // TODO: Handle result attributes; + if (funcOp.getResAttrs()) + return false; + // TODO: Handle exceptions. + if (funcOp.getPersonality()) + return false; + if (funcOp.getPassthrough()) { + // TODO: Used attributes should not be passthrough. + DenseSet disallowed( + {StringAttr::get(funcOp->getContext(), "noduplicate"), + StringAttr::get(funcOp->getContext(), "noinline"), + StringAttr::get(funcOp->getContext(), "optnone"), + StringAttr::get(funcOp->getContext(), "presplitcoroutine"), + StringAttr::get(funcOp->getContext(), "returns_twice"), + StringAttr::get(funcOp->getContext(), "strictfp")}); + if (llvm::any_of(*funcOp.getPassthrough(), [&](Attribute attr) { + auto stringAttr = dyn_cast(attr); + if (!stringAttr) + return false; + return disallowed.contains(stringAttr); + })) + return false; + } + return true; + } + + bool isLegalToInline(Region *, Region *, bool, IRMapping &) const final { + return true; + } + + /// Conservative allowlist of operations supported so far. + bool isLegalToInline(Operation *op, Region *, bool, IRMapping &) const final { + if (isPure(op)) + return true; + // Some attributes on memory operations require handling during + // inlining. Since this is not yet implemented, refuse to inline memory + // operations that have any of these attributes. + if (auto iface = dyn_cast(op)) + if (iface.getAliasScopesOrNull() || iface.getNoAliasScopesOrNull()) + return false; + if (auto iface = dyn_cast(op)) + if (iface.getAccessGroupsOrNull()) + return false; + return isa(op); + } + + /// Handle the given inlined return by replacing it with a branch. This + /// overload is called when the inlined region has more than one block. + void handleTerminator(Operation *op, Block *newDest) const final { + // Only return needs to be handled here. + auto returnOp = dyn_cast(op); + if (!returnOp) + return; + + // Replace the return with a branch to the dest. + OpBuilder builder(op); + builder.create(op->getLoc(), returnOp.getOperands(), newDest); + op->erase(); + } + + /// Handle the given inlined return by replacing the uses of the call with the + /// operands of the return. This overload is called when the inlined region + /// only contains one block. + void handleTerminator(Operation *op, + ArrayRef valuesToRepl) const final { + // Return will be the only terminator present. + auto returnOp = cast(op); + + // Replace the values directly with the return operands. 
+ assert(returnOp.getNumOperands() == valuesToRepl.size()); + for (const auto &[dst, src] : + llvm::zip(valuesToRepl, returnOp.getOperands())) + dst.replaceAllUsesWith(src); + } + + Value handleArgument(OpBuilder &builder, Operation *call, Operation *callable, + Value argument, Type targetType, + DictionaryAttr argumentAttrs) const final { + if (auto attr = + argumentAttrs.getNamed(LLVM::LLVMDialect::getByValAttrName())) + return handleByValArgument(builder, callable, argument, *attr); + return argument; + } + + void processInlinedCallBlocks( + Operation *call, + iterator_range inlinedBlocks) const override { + // Alloca operations with a constant size that were in the entry block of + // the callee should be moved to the entry block of the caller, as this will + // fold into prologue/epilogue code during code generation. + // This is not implemented as a standalone pattern because we need to know + // which newly inlined block was previously the entry block of the callee. + moveConstantAllocasToEntryBlock(inlinedBlocks); + } +}; + +} // end anonymous namespace + +void LLVM::detail::addLLVMInlinerInterface(LLVM::LLVMDialect *dialect) { + dialect->addInterfaces(); +} diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.h b/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.h new file mode 100644 index 0000000000000..c6f75d5657c3b --- /dev/null +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.h @@ -0,0 +1,33 @@ +//===- LLVMInlining.h - Registration of LLVMInlinerInterface ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Allows registering the LLVM DialectInlinerInterface with the LLVM dialect +// during initialization. +// +//===----------------------------------------------------------------------===// + +#ifndef DIALECT_LLVMIR_IR_LLVMINLINING_H +#define DIALECT_LLVMIR_IR_LLVMINLINING_H + +namespace mlir { +namespace LLVM { + +class LLVMDialect; + +namespace detail { + +/// Register the `LLVMInlinerInterface` implementation of +/// `DialectInlinerInterface` with the LLVM dialect. +void addLLVMInlinerInterface(LLVMDialect *dialect); + +} // namespace detail + +} // namespace LLVM +} // namespace mlir + +#endif // DIALECT_LLVMIR_IR_LLVMINLINING_H From 8482b238062ed7263facea9490f67119e00a037a Mon Sep 17 00:00:00 2001 From: Alexey Lapshin Date: Wed, 22 Mar 2023 17:37:15 +0100 Subject: [PATCH 121/208] [ADT] add ConcurrentHashtable class. ConcurrentHashTable - is a resizeable concurrent hashtable. The range of resizings is limited up to x2^32. The hashtable allows only concurrent insertions. Concurrent hashtable is necessary for the D96035 patch. 
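A minimal, single-threaded usage sketch (illustrative only: the String
wrapper below is a stand-in for a real KeyDataTy and simply mirrors the
contract the table expects, namely a getKey() accessor and a static
create(Key, Allocator) factory; for genuinely concurrent insertions the
allocator itself must be thread-safe, as with the thread-local allocator
used in the unit tests):

  #include "llvm/ADT/ConcurrentHashtable.h"
  #include "llvm/Support/Allocator.h"
  #include "llvm/Support/raw_ostream.h"
  #include <cassert>
  #include <new>
  #include <string>

  using namespace llvm;

  // KeyDataTy: owns a copy of the key and is created through the allocator.
  class String {
  public:
    const std::string &getKey() const { return Data; }

    static String *create(const std::string &Key, BumpPtrAllocator &Allocator) {
      return new (Allocator.Allocate<String>()) String(Key);
    }

  private:
    String(const std::string &Key) : Data(Key) {}
    std::string Data;
  };

  int main() {
    // BumpPtrAllocator is not thread-safe; fine for this single-threaded demo.
    BumpPtrAllocator Allocator;
    ConcurrentHashTableByPtr<std::string, String, BumpPtrAllocator,
                             ConcurrentHashTableInfoByPtr<std::string, String,
                                                          BumpPtrAllocator>>
        Table(Allocator);

    // insert() returns the entry and whether it was newly created.
    auto [Entry, Inserted] = Table.insert("hello");
    assert(Inserted && "first insertion creates the entry");

    // Re-inserting the same key returns the already existing object.
    auto [Dup, WasNew] = Table.insert("hello");
    assert(!WasNew && Dup == Entry && "duplicates share one entry");

    errs() << "key: " << Entry->getKey() << "\n";
    Table.printStatistic(errs());
    return 0;
  }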
Reviewed By: JDevlieghere Differential Revision: https://reviews.llvm.org/D132455 --- llvm/include/llvm/ADT/ConcurrentHashtable.h | 395 ++++++++++++++++++ llvm/unittests/ADT/CMakeLists.txt | 1 + .../unittests/ADT/ConcurrentHashtableTest.cpp | 279 +++++++++++++ 3 files changed, 675 insertions(+) create mode 100644 llvm/include/llvm/ADT/ConcurrentHashtable.h create mode 100644 llvm/unittests/ADT/ConcurrentHashtableTest.cpp diff --git a/llvm/include/llvm/ADT/ConcurrentHashtable.h b/llvm/include/llvm/ADT/ConcurrentHashtable.h new file mode 100644 index 0000000000000..56344ab9b8411 --- /dev/null +++ b/llvm/include/llvm/ADT/ConcurrentHashtable.h @@ -0,0 +1,395 @@ +//===- ConcurrentHashtable.h ------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ADT_CONCURRENTHASHTABLE_H +#define LLVM_ADT_CONCURRENTHASHTABLE_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/ADT/PointerIntPair.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Parallel.h" +#include "llvm/Support/WithColor.h" +#include "llvm/Support/xxhash.h" +#include +#include +#include +#include +#include +#include + +namespace llvm { + +/// ConcurrentHashTable - is a resizeable concurrent hashtable. +/// The number of resizings limited up to x2^32. This hashtable is +/// useful to have efficient access to aggregate data(like strings, +/// type descriptors...) and to keep only single copy of such +/// an aggregate. The hashtable allows only concurrent insertions: +/// +/// KeyDataTy* = insert ( const KeyTy& ); +/// +/// Data structure: +/// +/// Inserted value KeyTy is mapped to 64-bit hash value -> +/// +/// [------- 64-bit Hash value --------] +/// [ StartEntryIndex ][ Bucket Index ] +/// | | +/// points to the points to +/// first probe the bucket. +/// position inside +/// bucket entries +/// +/// After initialization, all buckets have an initial size. During insertions, +/// buckets might be extended to contain more entries. Each bucket can be +/// independently resized and rehashed(no need to lock the whole table). +/// Different buckets may have different sizes. If the single bucket is full +/// then the bucket is resized. +/// +/// BucketsArray keeps all buckets. Each bucket keeps an array of Entries +/// (pointers to KeyDataTy) and another array of entries hashes: +/// +/// BucketsArray[BucketIdx].Hashes[EntryIdx]: +/// BucketsArray[BucketIdx].Entries[EntryIdx]: +/// +/// [Bucket 0].Hashes -> [uint32_t][uint32_t] +/// [Bucket 0].Entries -> [KeyDataTy*][KeyDataTy*] +/// +/// [Bucket 1].Hashes -> [uint32_t][uint32_t][uint32_t][uint32_t] +/// [Bucket 1].Entries -> [KeyDataTy*][KeyDataTy*][KeyDataTy*][KeyDataTy*] +/// ......................... +/// [Bucket N].Hashes -> [uint32_t][uint32_t][uint32_t] +/// [Bucket N].Entries -> [KeyDataTy*][KeyDataTy*][KeyDataTy*] +/// +/// ConcurrentHashTableByPtr uses an external thread-safe allocator to allocate +/// KeyDataTy items. + +template +class ConcurrentHashTableInfoByPtr { +public: + /// \returns Hash value for the specified \p Key. 
+ static inline uint64_t getHashValue(const KeyTy &Key) { + return xxHash64(Key); + } + + /// \returns true if both \p LHS and \p RHS are equal. + static inline bool isEqual(const KeyTy &LHS, const KeyTy &RHS) { + return LHS == RHS; + } + + /// \returns key for the specified \p KeyData. + static inline const KeyTy &getKey(const KeyDataTy &KeyData) { + return KeyData.getKey(); + } + + /// \returns newly created object of KeyDataTy type. + static inline KeyDataTy *create(const KeyTy &Key, AllocatorTy &Allocator) { + return KeyDataTy::create(Key, Allocator); + } +}; + +template > +class ConcurrentHashTableByPtr { +public: + ConcurrentHashTableByPtr( + AllocatorTy &Allocator, size_t EstimatedSize = 100000, + size_t ThreadsNum = parallel::strategy.compute_thread_count(), + size_t InitialNumberOfBuckets = 128) + : MultiThreadAllocator(Allocator) { + assert((ThreadsNum > 0) && "ThreadsNum must be greater than 0"); + assert((InitialNumberOfBuckets > 0) && + "InitialNumberOfBuckets must be greater than 0"); + + constexpr size_t UINT64_BitsNum = sizeof(uint64_t) * 8; + constexpr size_t UINT32_BitsNum = sizeof(uint32_t) * 8; + + NumberOfBuckets = ThreadsNum; + + // Calculate number of buckets. + if (ThreadsNum > 1) { + NumberOfBuckets *= InitialNumberOfBuckets; + NumberOfBuckets *= std::max( + 1, + countr_zero(PowerOf2Ceil(EstimatedSize / InitialNumberOfBuckets)) >> + 2); + } + NumberOfBuckets = PowerOf2Ceil(NumberOfBuckets); + + // Allocate buckets. + BucketsArray = std::make_unique(NumberOfBuckets); + + InitialBucketSize = EstimatedSize / NumberOfBuckets; + InitialBucketSize = std::max((size_t)1, InitialBucketSize); + InitialBucketSize = PowerOf2Ceil(InitialBucketSize); + + // Initialize each bucket. + for (size_t Idx = 0; Idx < NumberOfBuckets; Idx++) { + HashesPtr Hashes = new ExtHashBitsTy[InitialBucketSize]; + memset(Hashes, 0, sizeof(ExtHashBitsTy) * InitialBucketSize); + + DataPtr Entries = new EntryDataTy[InitialBucketSize]; + memset(Entries, 0, sizeof(EntryDataTy) * InitialBucketSize); + + BucketsArray[Idx].Size = InitialBucketSize; + BucketsArray[Idx].Hashes = Hashes; + BucketsArray[Idx].Entries = Entries; + } + + // Calculate masks. + HashMask = NumberOfBuckets - 1; + + size_t LeadingZerosNumber = countl_zero(HashMask); + HashBitsNum = UINT64_BitsNum - LeadingZerosNumber; + + // We keep only high 32-bits of hash value. So bucket size cannot + // exceed 2^32. Bucket size is always power of two. + MaxBucketSize = 1Ull << (std::min(UINT32_BitsNum, LeadingZerosNumber)); + + // Calculate mask for extended hash bits. + ExtHashMask = (NumberOfBuckets * MaxBucketSize) - 1; + } + + virtual ~ConcurrentHashTableByPtr() { + // Deallocate buckets. + for (size_t Idx = 0; Idx < NumberOfBuckets; Idx++) { + delete[] BucketsArray[Idx].Hashes; + delete[] BucketsArray[Idx].Entries; + } + } + + /// Insert new value \p NewValue or return already existing entry. + /// + /// \returns entry and "true" if an entry is just inserted or + /// "false" if an entry already exists. + std::pair insert(const KeyTy &NewValue) { + // Calculate bucket index. + uint64_t Hash = Info::getHashValue(NewValue); + Bucket &CurBucket = BucketsArray[getBucketIdx(Hash)]; + uint32_t ExtHashBits = getExtHashBits(Hash); + + // Lock bucket. 
+ CurBucket.Guard.lock(); + + HashesPtr BucketHashes = CurBucket.Hashes; + DataPtr BucketEntries = CurBucket.Entries; + size_t CurEntryIdx = getStartIdx(ExtHashBits, CurBucket.Size); + + while (true) { + uint32_t CurEntryHashBits = BucketHashes[CurEntryIdx]; + + if (CurEntryHashBits == 0 && BucketEntries[CurEntryIdx] == nullptr) { + // Found empty slot. Insert data. + KeyDataTy *NewData = Info::create(NewValue, MultiThreadAllocator); + BucketEntries[CurEntryIdx] = NewData; + BucketHashes[CurEntryIdx] = ExtHashBits; + + CurBucket.NumberOfEntries++; + RehashBucket(CurBucket); + + CurBucket.Guard.unlock(); + + return {NewData, true}; + } + + if (CurEntryHashBits == ExtHashBits) { + // Hash matched. Check value for equality. + KeyDataTy *EntryData = BucketEntries[CurEntryIdx]; + if (Info::isEqual(Info::getKey(*EntryData), NewValue)) { + // Already existed entry matched with inserted data is found. + CurBucket.Guard.unlock(); + + return {EntryData, false}; + } + } + + CurEntryIdx++; + CurEntryIdx &= (CurBucket.Size - 1); + } + + llvm_unreachable("Insertion error."); + return {}; + } + + /// Print information about current state of hash table structures. + void printStatistic(raw_ostream &OS) { + OS << "\n--- HashTable statistic:\n"; + OS << "\nNumber of buckets = " << NumberOfBuckets; + OS << "\nInitial bucket size = " << InitialBucketSize; + + uint64_t NumberOfNonEmptyBuckets = 0; + uint64_t NumberOfEntriesPlusEmpty = 0; + uint64_t OverallNumberOfEntries = 0; + uint64_t OverallSize = sizeof(*this) + NumberOfBuckets * sizeof(Bucket); + + DenseMap BucketSizesMap; + + // For each bucket... + for (size_t Idx = 0; Idx < NumberOfBuckets; Idx++) { + Bucket &CurBucket = BucketsArray[Idx]; + + BucketSizesMap[CurBucket.Size]++; + + if (CurBucket.NumberOfEntries != 0) + NumberOfNonEmptyBuckets++; + NumberOfEntriesPlusEmpty += CurBucket.Size; + OverallNumberOfEntries += CurBucket.NumberOfEntries; + OverallSize += + (sizeof(ExtHashBitsTy) + sizeof(EntryDataTy)) * CurBucket.Size; + } + + OS << "\nOverall number of entries = " << OverallNumberOfEntries; + OS << "\nOverall number of non empty buckets = " << NumberOfNonEmptyBuckets; + for (auto &BucketSize : BucketSizesMap) + OS << "\n Number of buckets with size " << BucketSize.first << ": " + << BucketSize.second; + + std::stringstream stream; + stream << std::fixed << std::setprecision(2) + << ((float)OverallNumberOfEntries / (float)NumberOfEntriesPlusEmpty); + std::string str = stream.str(); + + OS << "\nLoad factor = " << str; + OS << "\nOverall allocated size = " << OverallSize; + } + +protected: + using ExtHashBitsTy = uint32_t; + using EntryDataTy = KeyDataTy *; + + using HashesPtr = ExtHashBitsTy *; + using DataPtr = EntryDataTy *; + + // Bucket structure. Keeps bucket data. + struct Bucket { + Bucket() = default; + + // Size of bucket. + uint32_t Size = 0; + + // Number of non-null entries. + size_t NumberOfEntries = 0; + + // Hashes for [Size] entries. + HashesPtr Hashes = nullptr; + + // [Size] entries. + DataPtr Entries = nullptr; + + // Mutex for this bucket. + std::mutex Guard; + }; + + // Reallocate and rehash bucket if this is full enough. 
+ void RehashBucket(Bucket &CurBucket) { + assert((CurBucket.Size > 0) && "Uninitialised bucket"); + if (CurBucket.NumberOfEntries < CurBucket.Size * 0.9) + return; + + if (CurBucket.Size >= MaxBucketSize) + report_fatal_error("ConcurrentHashTable is full"); + + size_t NewBucketSize = CurBucket.Size << 1; + assert((NewBucketSize <= MaxBucketSize) && "New bucket size is too big"); + assert((CurBucket.Size < NewBucketSize) && + "New bucket size less than size of current bucket"); + + // Store old entries & hashes arrays. + HashesPtr SrcHashes = CurBucket.Hashes; + DataPtr SrcEntries = CurBucket.Entries; + + // Allocate new entries&hashes arrays. + HashesPtr DestHashes = new ExtHashBitsTy[NewBucketSize]; + memset(DestHashes, 0, sizeof(ExtHashBitsTy) * NewBucketSize); + + DataPtr DestEntries = new EntryDataTy[NewBucketSize]; + memset(DestEntries, 0, sizeof(EntryDataTy) * NewBucketSize); + + // For each entry in source arrays... + for (size_t CurSrcEntryIdx = 0; CurSrcEntryIdx < CurBucket.Size; + CurSrcEntryIdx++) { + uint32_t CurSrcEntryHashBits = SrcHashes[CurSrcEntryIdx]; + + // Check for null entry. + if (CurSrcEntryHashBits == 0 && SrcEntries[CurSrcEntryIdx] == nullptr) + continue; + + size_t StartDestIdx = getStartIdx(CurSrcEntryHashBits, NewBucketSize); + + // Insert non-null entry into the new arrays. + while (true) { + uint32_t CurDestEntryHashBits = DestHashes[StartDestIdx]; + + if (CurDestEntryHashBits == 0 && DestEntries[StartDestIdx] == nullptr) { + // Found empty slot. Insert data. + DestHashes[StartDestIdx] = CurSrcEntryHashBits; + DestEntries[StartDestIdx] = SrcEntries[CurSrcEntryIdx]; + break; + } + + StartDestIdx++; + StartDestIdx = StartDestIdx & (NewBucketSize - 1); + } + } + + // Update bucket fields. + CurBucket.Hashes = DestHashes; + CurBucket.Entries = DestEntries; + CurBucket.Size = NewBucketSize; + + // Delete old bucket entries. + if (SrcHashes != nullptr) + delete[] SrcHashes; + if (SrcEntries != nullptr) + delete[] SrcEntries; + } + + size_t getBucketIdx(hash_code Hash) { return Hash & HashMask; } + + uint32_t getExtHashBits(uint64_t Hash) { + return (Hash & ExtHashMask) >> HashBitsNum; + } + + size_t getStartIdx(uint32_t ExtHashBits, size_t BucketSize) { + assert((BucketSize > 0) && "Empty bucket"); + + return ExtHashBits & (BucketSize - 1); + } + + // Number of bits in hash mask. + uint64_t HashBitsNum = 0; + + // Hash mask. + uint64_t HashMask = 0; + + // Hash mask for the extended hash bits. + uint64_t ExtHashMask = 0; + + // The maximal bucket size. + size_t MaxBucketSize = 0; + + // Initial size of bucket. + size_t InitialBucketSize = 0; + + // The number of buckets. + size_t NumberOfBuckets = 0; + + // Array of buckets. + std::unique_ptr BucketsArray; + + // Used for allocating KeyDataTy values. 
+ AllocatorTy &MultiThreadAllocator; +}; + +} // end namespace llvm + +#endif // LLVM_ADT_CONCURRENTHASHTABLE_H diff --git a/llvm/unittests/ADT/CMakeLists.txt b/llvm/unittests/ADT/CMakeLists.txt index 900294d4216ee..c5190255ba773 100644 --- a/llvm/unittests/ADT/CMakeLists.txt +++ b/llvm/unittests/ADT/CMakeLists.txt @@ -17,6 +17,7 @@ add_llvm_unittest(ADTTests BumpPtrListTest.cpp CoalescingBitVectorTest.cpp CombinationGeneratorTest.cpp + ConcurrentHashtableTest.cpp DAGDeltaAlgorithmTest.cpp DeltaAlgorithmTest.cpp DenseMapTest.cpp diff --git a/llvm/unittests/ADT/ConcurrentHashtableTest.cpp b/llvm/unittests/ADT/ConcurrentHashtableTest.cpp new file mode 100644 index 0000000000000..c4faece251790 --- /dev/null +++ b/llvm/unittests/ADT/ConcurrentHashtableTest.cpp @@ -0,0 +1,279 @@ +//===- ConcurrentHashtableTest.cpp ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/ConcurrentHashtable.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/Parallel.h" +#include "gtest/gtest.h" +#include +#include +#include +using namespace llvm; + +namespace { +class String { +public: + String() {} + const std::string &getKey() const { return Data; } + + template + static String *create(const std::string &Num, AllocatorTy &Allocator) { + String *Result = Allocator.template Allocate(); + new (Result) String(Num); + return Result; + } + +protected: + String(const std::string &Num) { Data += Num; } + + std::string Data; + std::array ExtraData; +}; + +static LLVM_THREAD_LOCAL BumpPtrAllocator ThreadLocalAllocator; +class PerThreadAllocator : public AllocatorBase { +public: + inline LLVM_ATTRIBUTE_RETURNS_NONNULL void *Allocate(size_t Size, + size_t Alignment) { + return ThreadLocalAllocator.Allocate(Size, Align(Alignment)); + } + inline size_t getBytesAllocated() const { + return ThreadLocalAllocator.getBytesAllocated(); + } + + // Pull in base class overloads. + using AllocatorBase::Allocate; +} Allocator; + +TEST(ConcurrentHashTableTest, AddStringEntries) { + ConcurrentHashTableByPtr< + std::string, String, PerThreadAllocator, + ConcurrentHashTableInfoByPtr> + HashTable(Allocator, 10); + + size_t AllocatedBytesAtStart = Allocator.getBytesAllocated(); + std::pair res1 = HashTable.insert("1"); + // Check entry is inserted. + EXPECT_TRUE(res1.first->getKey() == "1"); + EXPECT_TRUE(res1.second); + + std::pair res2 = HashTable.insert("2"); + // Check old entry is still valid. + EXPECT_TRUE(res1.first->getKey() == "1"); + // Check new entry is inserted. + EXPECT_TRUE(res2.first->getKey() == "2"); + EXPECT_TRUE(res2.second); + // Check new and old entries use different memory. + EXPECT_TRUE(res1.first != res2.first); + + std::pair res3 = HashTable.insert("3"); + // Check one more entry is inserted. + EXPECT_TRUE(res3.first->getKey() == "3"); + EXPECT_TRUE(res3.second); + + std::pair res4 = HashTable.insert("1"); + // Check duplicated entry is inserted. + EXPECT_TRUE(res4.first->getKey() == "1"); + EXPECT_FALSE(res4.second); + // Check duplicated entry uses the same memory. + EXPECT_TRUE(res1.first == res4.first); + + // Check first entry is still valid. + EXPECT_TRUE(res1.first->getKey() == "1"); + + // Check data was allocated by allocator. 
+ EXPECT_TRUE(Allocator.getBytesAllocated() > AllocatedBytesAtStart); + + // Check statistic. + std::string StatisticString; + raw_string_ostream StatisticStream(StatisticString); + HashTable.printStatistic(StatisticStream); + + EXPECT_TRUE(StatisticString.find("Overall number of entries = 3\n") != + std::string::npos); +} + +TEST(ConcurrentHashTableTest, AddStringMultiplueEntries) { + const size_t NumElements = 10000; + ConcurrentHashTableByPtr< + std::string, String, PerThreadAllocator, + ConcurrentHashTableInfoByPtr> + HashTable(Allocator); + + // Check insertion. + for (size_t I = 0; I < NumElements; I++) { + size_t AllocatedBytesAtStart = Allocator.getBytesAllocated(); + std::string StringForElement = formatv("{0}", I); + std::pair Entry = HashTable.insert(StringForElement); + EXPECT_TRUE(Entry.second); + EXPECT_TRUE(Entry.first->getKey() == StringForElement); + EXPECT_TRUE(Allocator.getBytesAllocated() > AllocatedBytesAtStart); + } + + std::string StatisticString; + raw_string_ostream StatisticStream(StatisticString); + HashTable.printStatistic(StatisticStream); + + // Verifying that the table contains exactly the number of elements we + // inserted. + EXPECT_TRUE(StatisticString.find("Overall number of entries = 10000\n") != + std::string::npos); + + // Check insertion of duplicates. + for (size_t I = 0; I < NumElements; I++) { + size_t AllocatedBytesAtStart = Allocator.getBytesAllocated(); + std::string StringForElement = formatv("{0}", I); + std::pair Entry = HashTable.insert(StringForElement); + EXPECT_FALSE(Entry.second); + EXPECT_TRUE(Entry.first->getKey() == StringForElement); + // Check no additional bytes were allocated for duplicate. + EXPECT_TRUE(Allocator.getBytesAllocated() == AllocatedBytesAtStart); + } + + // Check statistic. + // Verifying that the table contains exactly the number of elements we + // inserted. + EXPECT_TRUE(StatisticString.find("Overall number of entries = 10000\n") != + std::string::npos); +} + +TEST(ConcurrentHashTableTest, AddStringMultiplueEntriesWithResize) { + // Number of elements exceeds original size, thus hashtable should be resized. + const size_t NumElements = 20000; + ConcurrentHashTableByPtr< + std::string, String, PerThreadAllocator, + ConcurrentHashTableInfoByPtr> + HashTable(Allocator, 100); + + // Check insertion. + for (size_t I = 0; I < NumElements; I++) { + size_t AllocatedBytesAtStart = Allocator.getBytesAllocated(); + std::string StringForElement = formatv("{0} {1}", I, I + 100); + std::pair Entry = HashTable.insert(StringForElement); + EXPECT_TRUE(Entry.second); + EXPECT_TRUE(Entry.first->getKey() == StringForElement); + EXPECT_TRUE(Allocator.getBytesAllocated() > AllocatedBytesAtStart); + } + + std::string StatisticString; + raw_string_ostream StatisticStream(StatisticString); + HashTable.printStatistic(StatisticStream); + + // Verifying that the table contains exactly the number of elements we + // inserted. + EXPECT_TRUE(StatisticString.find("Overall number of entries = 20000\n") != + std::string::npos); + + // Check insertion of duplicates. + for (size_t I = 0; I < NumElements; I++) { + size_t AllocatedBytesAtStart = Allocator.getBytesAllocated(); + std::string StringForElement = formatv("{0} {1}", I, I + 100); + std::pair Entry = HashTable.insert(StringForElement); + EXPECT_FALSE(Entry.second); + EXPECT_TRUE(Entry.first->getKey() == StringForElement); + // Check no additional bytes were allocated for duplicate. + EXPECT_TRUE(Allocator.getBytesAllocated() == AllocatedBytesAtStart); + } + + // Check statistic. 
+ // Verifying that the table contains exactly the number of elements we + // inserted. + EXPECT_TRUE(StatisticString.find("Overall number of entries = 20000\n") != + std::string::npos); +} + +TEST(ConcurrentHashTableTest, AddStringEntriesParallel) { + const size_t NumElements = 10000; + ConcurrentHashTableByPtr< + std::string, String, PerThreadAllocator, + ConcurrentHashTableInfoByPtr> + HashTable(Allocator); + + // Check parallel insertion. + parallelFor(0, NumElements, [&](size_t I) { + size_t AllocatedBytesAtStart = Allocator.getBytesAllocated(); + std::string StringForElement = formatv("{0}", I); + std::pair Entry = HashTable.insert(StringForElement); + EXPECT_TRUE(Entry.second); + EXPECT_TRUE(Entry.first->getKey() == StringForElement); + EXPECT_TRUE(Allocator.getBytesAllocated() > AllocatedBytesAtStart); + }); + + std::string StatisticString; + raw_string_ostream StatisticStream(StatisticString); + HashTable.printStatistic(StatisticStream); + + // Verifying that the table contains exactly the number of elements we + // inserted. + EXPECT_TRUE(StatisticString.find("Overall number of entries = 10000\n") != + std::string::npos); + + // Check parallel insertion of duplicates. + parallelFor(0, NumElements, [&](size_t I) { + size_t AllocatedBytesAtStart = Allocator.getBytesAllocated(); + std::string StringForElement = formatv("{0}", I); + std::pair Entry = HashTable.insert(StringForElement); + EXPECT_FALSE(Entry.second); + EXPECT_TRUE(Entry.first->getKey() == StringForElement); + // Check no additional bytes were allocated for duplicate. + EXPECT_TRUE(Allocator.getBytesAllocated() == AllocatedBytesAtStart); + }); + + // Check statistic. + // Verifying that the table contains exactly the number of elements we + // inserted. + EXPECT_TRUE(StatisticString.find("Overall number of entries = 10000\n") != + std::string::npos); +} + +TEST(ConcurrentHashTableTest, AddStringEntriesParallelWithResize) { + const size_t NumElements = 20000; + ConcurrentHashTableByPtr< + std::string, String, PerThreadAllocator, + ConcurrentHashTableInfoByPtr> + HashTable(Allocator, 100); + + // Check parallel insertion. + parallelFor(0, NumElements, [&](size_t I) { + size_t AllocatedBytesAtStart = Allocator.getBytesAllocated(); + std::string StringForElement = formatv("{0}", I); + std::pair Entry = HashTable.insert(StringForElement); + EXPECT_TRUE(Entry.second); + EXPECT_TRUE(Entry.first->getKey() == StringForElement); + EXPECT_TRUE(Allocator.getBytesAllocated() > AllocatedBytesAtStart); + }); + + std::string StatisticString; + raw_string_ostream StatisticStream(StatisticString); + HashTable.printStatistic(StatisticStream); + + // Verifying that the table contains exactly the number of elements we + // inserted. + EXPECT_TRUE(StatisticString.find("Overall number of entries = 20000\n") != + std::string::npos); + + // Check parallel insertion of duplicates. + parallelFor(0, NumElements, [&](size_t I) { + size_t AllocatedBytesAtStart = Allocator.getBytesAllocated(); + std::string StringForElement = formatv("{0}", I); + std::pair Entry = HashTable.insert(StringForElement); + EXPECT_FALSE(Entry.second); + EXPECT_TRUE(Entry.first->getKey() == StringForElement); + // Check no additional bytes were allocated for duplicate. + EXPECT_TRUE(Allocator.getBytesAllocated() == AllocatedBytesAtStart); + }); + + // Check statistic. + // Verifying that the table contains exactly the number of elements we + // inserted. 
+ EXPECT_TRUE(StatisticString.find("Overall number of entries = 20000\n") != + std::string::npos); +} + +} // namespace From 320969f5058bfffd6517c36771b46ac4a447c7ee Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 23 Mar 2023 13:37:04 +0000 Subject: [PATCH 122/208] [X86] LowerVectorAllZero - add 512-bit support with AVX512 vptestnmd+kortestw patterns (REAPPLIED) Another step toward #53419 - this is also another step towards expanding MatchVectorAllZeroTest to match any pair of vectors and merge EmitAVX512Test into it. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 13 +++++- llvm/test/CodeGen/X86/ptest.ll | 15 +++--- llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll | 46 ++++++++----------- 3 files changed, 35 insertions(+), 39 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 74e2a2b6fdc10..2d371566381c8 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -24194,18 +24194,27 @@ static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC, // Without PTEST, a masked v2i64 or-reduction is not faster than // scalarization. + bool UseKORTEST = Subtarget.useAVX512Regs(); bool UsePTEST = Subtarget.hasSSE41(); if (!UsePTEST && !Mask.isAllOnes() && VT.getScalarSizeInBits() > 32) return SDValue(); - // Split down to 128/256-bit vector. - unsigned TestSize = Subtarget.hasAVX() ? 256 : 128; + // Split down to 128/256/512-bit vector. + unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128); while (VT.getSizeInBits() > TestSize) { auto Split = DAG.SplitVector(V, DL); VT = Split.first.getValueType(); V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second); } + if (UseKORTEST && VT.is512BitVector()) { + V = DAG.getBitcast(MVT::v16i32, MaskBits(V)); + V = DAG.getSetCC(DL, MVT::v16i1, V, + getZeroVector(MVT::v16i32, Subtarget, DAG, DL), + ISD::SETNE); + return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V); + } + if (UsePTEST) { MVT TestVT = VT.is128BitVector() ? 
MVT::v2i64 : MVT::v4i64; V = DAG.getBitcast(TestVT, MaskBits(V)); diff --git a/llvm/test/CodeGen/X86/ptest.ll b/llvm/test/CodeGen/X86/ptest.ll index 066cbb6193317..bedcfebc5f6e7 100644 --- a/llvm/test/CodeGen/X86/ptest.ll +++ b/llvm/test/CodeGen/X86/ptest.ll @@ -148,9 +148,8 @@ define i32 @veccond512(<16 x i32> %input) { ; ; AVX512-LABEL: veccond512: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vptest %ymm0, %ymm0 +; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: je .LBB2_2 ; AVX512-NEXT: # %bb.1: # %if-true-block ; AVX512-NEXT: xorl %eax, %eax @@ -268,10 +267,9 @@ define i32 @vectest512(<16 x i32> %input) { ; ; AVX512-LABEL: vectest512: ; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: vptest %ymm0, %ymm0 +; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -380,9 +378,8 @@ define i32 @vecsel512(<16 x i32> %input, i32 %a, i32 %b) { ; AVX512-LABEL: vecsel512: ; AVX512: # %bb.0: ; AVX512-NEXT: movl %edi, %eax -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vptest %ymm0, %ymm0 +; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: cmovel %esi, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll index fcb0ab6090398..a489a5e6099f0 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll @@ -105,9 +105,8 @@ define i1 @test_v8i64(<8 x i64> %a0) { ; ; AVX512-LABEL: test_v8i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vptest %ymm0, %ymm0 +; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -169,9 +168,8 @@ define i1 @test_v16i64(<16 x i64> %a0) { ; AVX512-LABEL: test_v16i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vptest %ymm0, %ymm0 +; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -298,9 +296,8 @@ define i1 @test_v16i32(<16 x i32> %a0) { ; ; AVX512-LABEL: test_v16i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vptest %ymm0, %ymm0 +; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -362,9 +359,8 @@ define i1 @test_v32i32(<32 x i32> %a0) { ; AVX512-LABEL: test_v32i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vptest %ymm0, %ymm0 +; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -510,9 +506,8 @@ define i1 @test_v32i16(<32 x i16> %a0) { ; ; AVX512-LABEL: test_v32i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; 
AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vptest %ymm0, %ymm0 +; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -574,9 +569,8 @@ define i1 @test_v64i16(<64 x i16> %a0) { ; AVX512-LABEL: test_v64i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vptest %ymm0, %ymm0 +; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -741,9 +735,8 @@ define i1 @test_v64i8(<64 x i8> %a0) { ; ; AVX512-LABEL: test_v64i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vptest %ymm0, %ymm0 +; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -805,9 +798,8 @@ define i1 @test_v128i8(<128 x i8> %a0) { ; AVX512-LABEL: test_v128i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vptest %ymm0, %ymm0 +; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1014,10 +1006,8 @@ define i1 @mask_v128i8(<128 x i8> %a0) { ; AVX512-LABEL: mask_v128i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] -; AVX512-NEXT: vptest %ymm1, %ymm0 +; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq From 8c7c1f11ffaacf762e612c65440fd2cbb58ee426 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Thu, 23 Mar 2023 14:41:03 +0100 Subject: [PATCH 123/208] Silence unused variable warning in NDEBUG builds I usually would fold this into the assert, but the comment there suggests side effects. NFC. ModuleMap.cpp:938:9: error: unused variable 'MainFile' [-Werror,-Wunused-variable] auto *MainFile = SourceMgr.getFileEntryForID(SourceMgr.getMainFileID()); --- clang/lib/Lex/ModuleMap.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp index f2b2d0b8c69f1..44c872336ce9c 100644 --- a/clang/lib/Lex/ModuleMap.cpp +++ b/clang/lib/Lex/ModuleMap.cpp @@ -936,6 +936,7 @@ Module *ModuleMap::createModuleForImplementationUnit(SourceLocation Loc, // Mark the main source file as being within the newly-created module so that // declarations and macros are properly visibility-restricted to it. auto *MainFile = SourceMgr.getFileEntryForID(SourceMgr.getMainFileID()); + (void)MainFile; assert(MainFile && "no input file for module implementation"); return Result; From fd4aeba307ca30da00a8db21a200cc9afcef63c6 Mon Sep 17 00:00:00 2001 From: Alexey Lapshin Date: Thu, 23 Mar 2023 14:40:29 +0100 Subject: [PATCH 124/208] Revert "[ADT] add ConcurrentHashtable class." This reverts commit 8482b238062ed7263facea9490f67119e00a037a. 
--- llvm/include/llvm/ADT/ConcurrentHashtable.h | 395 ------------------ llvm/unittests/ADT/CMakeLists.txt | 1 - .../unittests/ADT/ConcurrentHashtableTest.cpp | 279 ------------- 3 files changed, 675 deletions(-) delete mode 100644 llvm/include/llvm/ADT/ConcurrentHashtable.h delete mode 100644 llvm/unittests/ADT/ConcurrentHashtableTest.cpp diff --git a/llvm/include/llvm/ADT/ConcurrentHashtable.h b/llvm/include/llvm/ADT/ConcurrentHashtable.h deleted file mode 100644 index 56344ab9b8411..0000000000000 --- a/llvm/include/llvm/ADT/ConcurrentHashtable.h +++ /dev/null @@ -1,395 +0,0 @@ -//===- ConcurrentHashtable.h ------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_ADT_CONCURRENTHASHTABLE_H -#define LLVM_ADT_CONCURRENTHASHTABLE_H - -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/Hashing.h" -#include "llvm/ADT/PointerIntPair.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/Support/Allocator.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/Parallel.h" -#include "llvm/Support/WithColor.h" -#include "llvm/Support/xxhash.h" -#include -#include -#include -#include -#include -#include - -namespace llvm { - -/// ConcurrentHashTable - is a resizeable concurrent hashtable. -/// The number of resizings limited up to x2^32. This hashtable is -/// useful to have efficient access to aggregate data(like strings, -/// type descriptors...) and to keep only single copy of such -/// an aggregate. The hashtable allows only concurrent insertions: -/// -/// KeyDataTy* = insert ( const KeyTy& ); -/// -/// Data structure: -/// -/// Inserted value KeyTy is mapped to 64-bit hash value -> -/// -/// [------- 64-bit Hash value --------] -/// [ StartEntryIndex ][ Bucket Index ] -/// | | -/// points to the points to -/// first probe the bucket. -/// position inside -/// bucket entries -/// -/// After initialization, all buckets have an initial size. During insertions, -/// buckets might be extended to contain more entries. Each bucket can be -/// independently resized and rehashed(no need to lock the whole table). -/// Different buckets may have different sizes. If the single bucket is full -/// then the bucket is resized. -/// -/// BucketsArray keeps all buckets. Each bucket keeps an array of Entries -/// (pointers to KeyDataTy) and another array of entries hashes: -/// -/// BucketsArray[BucketIdx].Hashes[EntryIdx]: -/// BucketsArray[BucketIdx].Entries[EntryIdx]: -/// -/// [Bucket 0].Hashes -> [uint32_t][uint32_t] -/// [Bucket 0].Entries -> [KeyDataTy*][KeyDataTy*] -/// -/// [Bucket 1].Hashes -> [uint32_t][uint32_t][uint32_t][uint32_t] -/// [Bucket 1].Entries -> [KeyDataTy*][KeyDataTy*][KeyDataTy*][KeyDataTy*] -/// ......................... -/// [Bucket N].Hashes -> [uint32_t][uint32_t][uint32_t] -/// [Bucket N].Entries -> [KeyDataTy*][KeyDataTy*][KeyDataTy*] -/// -/// ConcurrentHashTableByPtr uses an external thread-safe allocator to allocate -/// KeyDataTy items. - -template -class ConcurrentHashTableInfoByPtr { -public: - /// \returns Hash value for the specified \p Key. - static inline uint64_t getHashValue(const KeyTy &Key) { - return xxHash64(Key); - } - - /// \returns true if both \p LHS and \p RHS are equal. 
- static inline bool isEqual(const KeyTy &LHS, const KeyTy &RHS) { - return LHS == RHS; - } - - /// \returns key for the specified \p KeyData. - static inline const KeyTy &getKey(const KeyDataTy &KeyData) { - return KeyData.getKey(); - } - - /// \returns newly created object of KeyDataTy type. - static inline KeyDataTy *create(const KeyTy &Key, AllocatorTy &Allocator) { - return KeyDataTy::create(Key, Allocator); - } -}; - -template > -class ConcurrentHashTableByPtr { -public: - ConcurrentHashTableByPtr( - AllocatorTy &Allocator, size_t EstimatedSize = 100000, - size_t ThreadsNum = parallel::strategy.compute_thread_count(), - size_t InitialNumberOfBuckets = 128) - : MultiThreadAllocator(Allocator) { - assert((ThreadsNum > 0) && "ThreadsNum must be greater than 0"); - assert((InitialNumberOfBuckets > 0) && - "InitialNumberOfBuckets must be greater than 0"); - - constexpr size_t UINT64_BitsNum = sizeof(uint64_t) * 8; - constexpr size_t UINT32_BitsNum = sizeof(uint32_t) * 8; - - NumberOfBuckets = ThreadsNum; - - // Calculate number of buckets. - if (ThreadsNum > 1) { - NumberOfBuckets *= InitialNumberOfBuckets; - NumberOfBuckets *= std::max( - 1, - countr_zero(PowerOf2Ceil(EstimatedSize / InitialNumberOfBuckets)) >> - 2); - } - NumberOfBuckets = PowerOf2Ceil(NumberOfBuckets); - - // Allocate buckets. - BucketsArray = std::make_unique(NumberOfBuckets); - - InitialBucketSize = EstimatedSize / NumberOfBuckets; - InitialBucketSize = std::max((size_t)1, InitialBucketSize); - InitialBucketSize = PowerOf2Ceil(InitialBucketSize); - - // Initialize each bucket. - for (size_t Idx = 0; Idx < NumberOfBuckets; Idx++) { - HashesPtr Hashes = new ExtHashBitsTy[InitialBucketSize]; - memset(Hashes, 0, sizeof(ExtHashBitsTy) * InitialBucketSize); - - DataPtr Entries = new EntryDataTy[InitialBucketSize]; - memset(Entries, 0, sizeof(EntryDataTy) * InitialBucketSize); - - BucketsArray[Idx].Size = InitialBucketSize; - BucketsArray[Idx].Hashes = Hashes; - BucketsArray[Idx].Entries = Entries; - } - - // Calculate masks. - HashMask = NumberOfBuckets - 1; - - size_t LeadingZerosNumber = countl_zero(HashMask); - HashBitsNum = UINT64_BitsNum - LeadingZerosNumber; - - // We keep only high 32-bits of hash value. So bucket size cannot - // exceed 2^32. Bucket size is always power of two. - MaxBucketSize = 1Ull << (std::min(UINT32_BitsNum, LeadingZerosNumber)); - - // Calculate mask for extended hash bits. - ExtHashMask = (NumberOfBuckets * MaxBucketSize) - 1; - } - - virtual ~ConcurrentHashTableByPtr() { - // Deallocate buckets. - for (size_t Idx = 0; Idx < NumberOfBuckets; Idx++) { - delete[] BucketsArray[Idx].Hashes; - delete[] BucketsArray[Idx].Entries; - } - } - - /// Insert new value \p NewValue or return already existing entry. - /// - /// \returns entry and "true" if an entry is just inserted or - /// "false" if an entry already exists. - std::pair insert(const KeyTy &NewValue) { - // Calculate bucket index. - uint64_t Hash = Info::getHashValue(NewValue); - Bucket &CurBucket = BucketsArray[getBucketIdx(Hash)]; - uint32_t ExtHashBits = getExtHashBits(Hash); - - // Lock bucket. - CurBucket.Guard.lock(); - - HashesPtr BucketHashes = CurBucket.Hashes; - DataPtr BucketEntries = CurBucket.Entries; - size_t CurEntryIdx = getStartIdx(ExtHashBits, CurBucket.Size); - - while (true) { - uint32_t CurEntryHashBits = BucketHashes[CurEntryIdx]; - - if (CurEntryHashBits == 0 && BucketEntries[CurEntryIdx] == nullptr) { - // Found empty slot. Insert data. 
- KeyDataTy *NewData = Info::create(NewValue, MultiThreadAllocator); - BucketEntries[CurEntryIdx] = NewData; - BucketHashes[CurEntryIdx] = ExtHashBits; - - CurBucket.NumberOfEntries++; - RehashBucket(CurBucket); - - CurBucket.Guard.unlock(); - - return {NewData, true}; - } - - if (CurEntryHashBits == ExtHashBits) { - // Hash matched. Check value for equality. - KeyDataTy *EntryData = BucketEntries[CurEntryIdx]; - if (Info::isEqual(Info::getKey(*EntryData), NewValue)) { - // Already existed entry matched with inserted data is found. - CurBucket.Guard.unlock(); - - return {EntryData, false}; - } - } - - CurEntryIdx++; - CurEntryIdx &= (CurBucket.Size - 1); - } - - llvm_unreachable("Insertion error."); - return {}; - } - - /// Print information about current state of hash table structures. - void printStatistic(raw_ostream &OS) { - OS << "\n--- HashTable statistic:\n"; - OS << "\nNumber of buckets = " << NumberOfBuckets; - OS << "\nInitial bucket size = " << InitialBucketSize; - - uint64_t NumberOfNonEmptyBuckets = 0; - uint64_t NumberOfEntriesPlusEmpty = 0; - uint64_t OverallNumberOfEntries = 0; - uint64_t OverallSize = sizeof(*this) + NumberOfBuckets * sizeof(Bucket); - - DenseMap BucketSizesMap; - - // For each bucket... - for (size_t Idx = 0; Idx < NumberOfBuckets; Idx++) { - Bucket &CurBucket = BucketsArray[Idx]; - - BucketSizesMap[CurBucket.Size]++; - - if (CurBucket.NumberOfEntries != 0) - NumberOfNonEmptyBuckets++; - NumberOfEntriesPlusEmpty += CurBucket.Size; - OverallNumberOfEntries += CurBucket.NumberOfEntries; - OverallSize += - (sizeof(ExtHashBitsTy) + sizeof(EntryDataTy)) * CurBucket.Size; - } - - OS << "\nOverall number of entries = " << OverallNumberOfEntries; - OS << "\nOverall number of non empty buckets = " << NumberOfNonEmptyBuckets; - for (auto &BucketSize : BucketSizesMap) - OS << "\n Number of buckets with size " << BucketSize.first << ": " - << BucketSize.second; - - std::stringstream stream; - stream << std::fixed << std::setprecision(2) - << ((float)OverallNumberOfEntries / (float)NumberOfEntriesPlusEmpty); - std::string str = stream.str(); - - OS << "\nLoad factor = " << str; - OS << "\nOverall allocated size = " << OverallSize; - } - -protected: - using ExtHashBitsTy = uint32_t; - using EntryDataTy = KeyDataTy *; - - using HashesPtr = ExtHashBitsTy *; - using DataPtr = EntryDataTy *; - - // Bucket structure. Keeps bucket data. - struct Bucket { - Bucket() = default; - - // Size of bucket. - uint32_t Size = 0; - - // Number of non-null entries. - size_t NumberOfEntries = 0; - - // Hashes for [Size] entries. - HashesPtr Hashes = nullptr; - - // [Size] entries. - DataPtr Entries = nullptr; - - // Mutex for this bucket. - std::mutex Guard; - }; - - // Reallocate and rehash bucket if this is full enough. - void RehashBucket(Bucket &CurBucket) { - assert((CurBucket.Size > 0) && "Uninitialised bucket"); - if (CurBucket.NumberOfEntries < CurBucket.Size * 0.9) - return; - - if (CurBucket.Size >= MaxBucketSize) - report_fatal_error("ConcurrentHashTable is full"); - - size_t NewBucketSize = CurBucket.Size << 1; - assert((NewBucketSize <= MaxBucketSize) && "New bucket size is too big"); - assert((CurBucket.Size < NewBucketSize) && - "New bucket size less than size of current bucket"); - - // Store old entries & hashes arrays. - HashesPtr SrcHashes = CurBucket.Hashes; - DataPtr SrcEntries = CurBucket.Entries; - - // Allocate new entries&hashes arrays. 
- HashesPtr DestHashes = new ExtHashBitsTy[NewBucketSize]; - memset(DestHashes, 0, sizeof(ExtHashBitsTy) * NewBucketSize); - - DataPtr DestEntries = new EntryDataTy[NewBucketSize]; - memset(DestEntries, 0, sizeof(EntryDataTy) * NewBucketSize); - - // For each entry in source arrays... - for (size_t CurSrcEntryIdx = 0; CurSrcEntryIdx < CurBucket.Size; - CurSrcEntryIdx++) { - uint32_t CurSrcEntryHashBits = SrcHashes[CurSrcEntryIdx]; - - // Check for null entry. - if (CurSrcEntryHashBits == 0 && SrcEntries[CurSrcEntryIdx] == nullptr) - continue; - - size_t StartDestIdx = getStartIdx(CurSrcEntryHashBits, NewBucketSize); - - // Insert non-null entry into the new arrays. - while (true) { - uint32_t CurDestEntryHashBits = DestHashes[StartDestIdx]; - - if (CurDestEntryHashBits == 0 && DestEntries[StartDestIdx] == nullptr) { - // Found empty slot. Insert data. - DestHashes[StartDestIdx] = CurSrcEntryHashBits; - DestEntries[StartDestIdx] = SrcEntries[CurSrcEntryIdx]; - break; - } - - StartDestIdx++; - StartDestIdx = StartDestIdx & (NewBucketSize - 1); - } - } - - // Update bucket fields. - CurBucket.Hashes = DestHashes; - CurBucket.Entries = DestEntries; - CurBucket.Size = NewBucketSize; - - // Delete old bucket entries. - if (SrcHashes != nullptr) - delete[] SrcHashes; - if (SrcEntries != nullptr) - delete[] SrcEntries; - } - - size_t getBucketIdx(hash_code Hash) { return Hash & HashMask; } - - uint32_t getExtHashBits(uint64_t Hash) { - return (Hash & ExtHashMask) >> HashBitsNum; - } - - size_t getStartIdx(uint32_t ExtHashBits, size_t BucketSize) { - assert((BucketSize > 0) && "Empty bucket"); - - return ExtHashBits & (BucketSize - 1); - } - - // Number of bits in hash mask. - uint64_t HashBitsNum = 0; - - // Hash mask. - uint64_t HashMask = 0; - - // Hash mask for the extended hash bits. - uint64_t ExtHashMask = 0; - - // The maximal bucket size. - size_t MaxBucketSize = 0; - - // Initial size of bucket. - size_t InitialBucketSize = 0; - - // The number of buckets. - size_t NumberOfBuckets = 0; - - // Array of buckets. - std::unique_ptr BucketsArray; - - // Used for allocating KeyDataTy values. - AllocatorTy &MultiThreadAllocator; -}; - -} // end namespace llvm - -#endif // LLVM_ADT_CONCURRENTHASHTABLE_H diff --git a/llvm/unittests/ADT/CMakeLists.txt b/llvm/unittests/ADT/CMakeLists.txt index c5190255ba773..900294d4216ee 100644 --- a/llvm/unittests/ADT/CMakeLists.txt +++ b/llvm/unittests/ADT/CMakeLists.txt @@ -17,7 +17,6 @@ add_llvm_unittest(ADTTests BumpPtrListTest.cpp CoalescingBitVectorTest.cpp CombinationGeneratorTest.cpp - ConcurrentHashtableTest.cpp DAGDeltaAlgorithmTest.cpp DeltaAlgorithmTest.cpp DenseMapTest.cpp diff --git a/llvm/unittests/ADT/ConcurrentHashtableTest.cpp b/llvm/unittests/ADT/ConcurrentHashtableTest.cpp deleted file mode 100644 index c4faece251790..0000000000000 --- a/llvm/unittests/ADT/ConcurrentHashtableTest.cpp +++ /dev/null @@ -1,279 +0,0 @@ -//===- ConcurrentHashtableTest.cpp ----------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/ConcurrentHashtable.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/FormatVariadic.h" -#include "llvm/Support/Parallel.h" -#include "gtest/gtest.h" -#include -#include -#include -using namespace llvm; - -namespace { -class String { -public: - String() {} - const std::string &getKey() const { return Data; } - - template - static String *create(const std::string &Num, AllocatorTy &Allocator) { - String *Result = Allocator.template Allocate(); - new (Result) String(Num); - return Result; - } - -protected: - String(const std::string &Num) { Data += Num; } - - std::string Data; - std::array ExtraData; -}; - -static LLVM_THREAD_LOCAL BumpPtrAllocator ThreadLocalAllocator; -class PerThreadAllocator : public AllocatorBase { -public: - inline LLVM_ATTRIBUTE_RETURNS_NONNULL void *Allocate(size_t Size, - size_t Alignment) { - return ThreadLocalAllocator.Allocate(Size, Align(Alignment)); - } - inline size_t getBytesAllocated() const { - return ThreadLocalAllocator.getBytesAllocated(); - } - - // Pull in base class overloads. - using AllocatorBase::Allocate; -} Allocator; - -TEST(ConcurrentHashTableTest, AddStringEntries) { - ConcurrentHashTableByPtr< - std::string, String, PerThreadAllocator, - ConcurrentHashTableInfoByPtr> - HashTable(Allocator, 10); - - size_t AllocatedBytesAtStart = Allocator.getBytesAllocated(); - std::pair res1 = HashTable.insert("1"); - // Check entry is inserted. - EXPECT_TRUE(res1.first->getKey() == "1"); - EXPECT_TRUE(res1.second); - - std::pair res2 = HashTable.insert("2"); - // Check old entry is still valid. - EXPECT_TRUE(res1.first->getKey() == "1"); - // Check new entry is inserted. - EXPECT_TRUE(res2.first->getKey() == "2"); - EXPECT_TRUE(res2.second); - // Check new and old entries use different memory. - EXPECT_TRUE(res1.first != res2.first); - - std::pair res3 = HashTable.insert("3"); - // Check one more entry is inserted. - EXPECT_TRUE(res3.first->getKey() == "3"); - EXPECT_TRUE(res3.second); - - std::pair res4 = HashTable.insert("1"); - // Check duplicated entry is inserted. - EXPECT_TRUE(res4.first->getKey() == "1"); - EXPECT_FALSE(res4.second); - // Check duplicated entry uses the same memory. - EXPECT_TRUE(res1.first == res4.first); - - // Check first entry is still valid. - EXPECT_TRUE(res1.first->getKey() == "1"); - - // Check data was allocated by allocator. - EXPECT_TRUE(Allocator.getBytesAllocated() > AllocatedBytesAtStart); - - // Check statistic. - std::string StatisticString; - raw_string_ostream StatisticStream(StatisticString); - HashTable.printStatistic(StatisticStream); - - EXPECT_TRUE(StatisticString.find("Overall number of entries = 3\n") != - std::string::npos); -} - -TEST(ConcurrentHashTableTest, AddStringMultiplueEntries) { - const size_t NumElements = 10000; - ConcurrentHashTableByPtr< - std::string, String, PerThreadAllocator, - ConcurrentHashTableInfoByPtr> - HashTable(Allocator); - - // Check insertion. 
- for (size_t I = 0; I < NumElements; I++) { - size_t AllocatedBytesAtStart = Allocator.getBytesAllocated(); - std::string StringForElement = formatv("{0}", I); - std::pair Entry = HashTable.insert(StringForElement); - EXPECT_TRUE(Entry.second); - EXPECT_TRUE(Entry.first->getKey() == StringForElement); - EXPECT_TRUE(Allocator.getBytesAllocated() > AllocatedBytesAtStart); - } - - std::string StatisticString; - raw_string_ostream StatisticStream(StatisticString); - HashTable.printStatistic(StatisticStream); - - // Verifying that the table contains exactly the number of elements we - // inserted. - EXPECT_TRUE(StatisticString.find("Overall number of entries = 10000\n") != - std::string::npos); - - // Check insertion of duplicates. - for (size_t I = 0; I < NumElements; I++) { - size_t AllocatedBytesAtStart = Allocator.getBytesAllocated(); - std::string StringForElement = formatv("{0}", I); - std::pair Entry = HashTable.insert(StringForElement); - EXPECT_FALSE(Entry.second); - EXPECT_TRUE(Entry.first->getKey() == StringForElement); - // Check no additional bytes were allocated for duplicate. - EXPECT_TRUE(Allocator.getBytesAllocated() == AllocatedBytesAtStart); - } - - // Check statistic. - // Verifying that the table contains exactly the number of elements we - // inserted. - EXPECT_TRUE(StatisticString.find("Overall number of entries = 10000\n") != - std::string::npos); -} - -TEST(ConcurrentHashTableTest, AddStringMultiplueEntriesWithResize) { - // Number of elements exceeds original size, thus hashtable should be resized. - const size_t NumElements = 20000; - ConcurrentHashTableByPtr< - std::string, String, PerThreadAllocator, - ConcurrentHashTableInfoByPtr> - HashTable(Allocator, 100); - - // Check insertion. - for (size_t I = 0; I < NumElements; I++) { - size_t AllocatedBytesAtStart = Allocator.getBytesAllocated(); - std::string StringForElement = formatv("{0} {1}", I, I + 100); - std::pair Entry = HashTable.insert(StringForElement); - EXPECT_TRUE(Entry.second); - EXPECT_TRUE(Entry.first->getKey() == StringForElement); - EXPECT_TRUE(Allocator.getBytesAllocated() > AllocatedBytesAtStart); - } - - std::string StatisticString; - raw_string_ostream StatisticStream(StatisticString); - HashTable.printStatistic(StatisticStream); - - // Verifying that the table contains exactly the number of elements we - // inserted. - EXPECT_TRUE(StatisticString.find("Overall number of entries = 20000\n") != - std::string::npos); - - // Check insertion of duplicates. - for (size_t I = 0; I < NumElements; I++) { - size_t AllocatedBytesAtStart = Allocator.getBytesAllocated(); - std::string StringForElement = formatv("{0} {1}", I, I + 100); - std::pair Entry = HashTable.insert(StringForElement); - EXPECT_FALSE(Entry.second); - EXPECT_TRUE(Entry.first->getKey() == StringForElement); - // Check no additional bytes were allocated for duplicate. - EXPECT_TRUE(Allocator.getBytesAllocated() == AllocatedBytesAtStart); - } - - // Check statistic. - // Verifying that the table contains exactly the number of elements we - // inserted. - EXPECT_TRUE(StatisticString.find("Overall number of entries = 20000\n") != - std::string::npos); -} - -TEST(ConcurrentHashTableTest, AddStringEntriesParallel) { - const size_t NumElements = 10000; - ConcurrentHashTableByPtr< - std::string, String, PerThreadAllocator, - ConcurrentHashTableInfoByPtr> - HashTable(Allocator); - - // Check parallel insertion. 
- parallelFor(0, NumElements, [&](size_t I) { - size_t AllocatedBytesAtStart = Allocator.getBytesAllocated(); - std::string StringForElement = formatv("{0}", I); - std::pair Entry = HashTable.insert(StringForElement); - EXPECT_TRUE(Entry.second); - EXPECT_TRUE(Entry.first->getKey() == StringForElement); - EXPECT_TRUE(Allocator.getBytesAllocated() > AllocatedBytesAtStart); - }); - - std::string StatisticString; - raw_string_ostream StatisticStream(StatisticString); - HashTable.printStatistic(StatisticStream); - - // Verifying that the table contains exactly the number of elements we - // inserted. - EXPECT_TRUE(StatisticString.find("Overall number of entries = 10000\n") != - std::string::npos); - - // Check parallel insertion of duplicates. - parallelFor(0, NumElements, [&](size_t I) { - size_t AllocatedBytesAtStart = Allocator.getBytesAllocated(); - std::string StringForElement = formatv("{0}", I); - std::pair Entry = HashTable.insert(StringForElement); - EXPECT_FALSE(Entry.second); - EXPECT_TRUE(Entry.first->getKey() == StringForElement); - // Check no additional bytes were allocated for duplicate. - EXPECT_TRUE(Allocator.getBytesAllocated() == AllocatedBytesAtStart); - }); - - // Check statistic. - // Verifying that the table contains exactly the number of elements we - // inserted. - EXPECT_TRUE(StatisticString.find("Overall number of entries = 10000\n") != - std::string::npos); -} - -TEST(ConcurrentHashTableTest, AddStringEntriesParallelWithResize) { - const size_t NumElements = 20000; - ConcurrentHashTableByPtr< - std::string, String, PerThreadAllocator, - ConcurrentHashTableInfoByPtr> - HashTable(Allocator, 100); - - // Check parallel insertion. - parallelFor(0, NumElements, [&](size_t I) { - size_t AllocatedBytesAtStart = Allocator.getBytesAllocated(); - std::string StringForElement = formatv("{0}", I); - std::pair Entry = HashTable.insert(StringForElement); - EXPECT_TRUE(Entry.second); - EXPECT_TRUE(Entry.first->getKey() == StringForElement); - EXPECT_TRUE(Allocator.getBytesAllocated() > AllocatedBytesAtStart); - }); - - std::string StatisticString; - raw_string_ostream StatisticStream(StatisticString); - HashTable.printStatistic(StatisticStream); - - // Verifying that the table contains exactly the number of elements we - // inserted. - EXPECT_TRUE(StatisticString.find("Overall number of entries = 20000\n") != - std::string::npos); - - // Check parallel insertion of duplicates. - parallelFor(0, NumElements, [&](size_t I) { - size_t AllocatedBytesAtStart = Allocator.getBytesAllocated(); - std::string StringForElement = formatv("{0}", I); - std::pair Entry = HashTable.insert(StringForElement); - EXPECT_FALSE(Entry.second); - EXPECT_TRUE(Entry.first->getKey() == StringForElement); - // Check no additional bytes were allocated for duplicate. - EXPECT_TRUE(Allocator.getBytesAllocated() == AllocatedBytesAtStart); - }); - - // Check statistic. - // Verifying that the table contains exactly the number of elements we - // inserted. - EXPECT_TRUE(StatisticString.find("Overall number of entries = 20000\n") != - std::string::npos); -} - -} // namespace From 4f9929add5369490c9607fc9203761483d2bc916 Mon Sep 17 00:00:00 2001 From: Qiongsi Wu Date: Thu, 23 Mar 2023 09:16:18 -0400 Subject: [PATCH 125/208] [AIX][CodeGen] Storage Locations for Constant Pointers This patch adds an `llc` option `-mroptr` to specify storage locations for constant pointers on AIX. When the `-mroptr` option is specified, constant pointers, virtual function tables, and virtual type tables are placed in read-only storage. 
Otherwise, by default, pointers, virtual function tables, and virtual type tables are placed in read/write storage. https://reviews.llvm.org/D144190 enables the `-mroptr` option for `clang`. Reviewed By: hubert.reinterpretcast, stephenpeckham, myhsu, MaskRay, serge-sans-paille Differential Revision: https://reviews.llvm.org/D144189 --- llvm/docs/ReleaseNotes.rst | 6 +++- llvm/include/llvm/CodeGen/CommandFlags.h | 2 ++ llvm/include/llvm/Target/TargetOptions.h | 19 ++++++----- llvm/lib/CodeGen/CommandFlags.cpp | 9 +++++ .../CodeGen/TargetLoweringObjectFileImpl.cpp | 20 ++++++++--- llvm/test/CodeGen/PowerPC/aix-xcoff-roptr.ll | 30 +++++++++++++++++ .../test/CodeGen/PowerPC/aix64-xcoff-roptr.ll | 33 +++++++++++++++++++ llvm/tools/llc/llc.cpp | 18 ++++++++++ 8 files changed, 124 insertions(+), 13 deletions(-) create mode 100644 llvm/test/CodeGen/PowerPC/aix-xcoff-roptr.ll create mode 100644 llvm/test/CodeGen/PowerPC/aix64-xcoff-roptr.ll
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 525f57a90dfb0..6f78497644479 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -122,7 +122,11 @@ Changes to the MIPS Backend Changes to the PowerPC Backend ------------------------------ -* ... +* A new option ``-mroptr`` is added to ``clang`` and ``llc``. When this option + is present, constant objects with relocatable address values are put into the + RO data section. This option should be used with the ``-fdata-sections`` + option, and is not supported with ``-fno-data-sections``. The option is + only supported on AIX. Changes to the RISC-V Backend ----------------------------- diff --git a/llvm/include/llvm/CodeGen/CommandFlags.h b/llvm/include/llvm/CodeGen/CommandFlags.h index 475d87bdd5b13..19b466629dbfc 100644 --- a/llvm/include/llvm/CodeGen/CommandFlags.h +++ b/llvm/include/llvm/CodeGen/CommandFlags.h @@ -143,6 +143,8 @@ unsigned getAlignLoops(); bool getJMCInstrument(); +bool getXCOFFReadOnlyPointers(); + /// Create this object with static storage to register codegen-related command /// line options.
struct RegisterCodeGenFlags { diff --git a/llvm/include/llvm/Target/TargetOptions.h b/llvm/include/llvm/Target/TargetOptions.h index 22e811653c6d4..76e4248088afd 100644 --- a/llvm/include/llvm/Target/TargetOptions.h +++ b/llvm/include/llvm/Target/TargetOptions.h @@ -130,13 +130,12 @@ namespace llvm { HonorSignDependentRoundingFPMathOption(false), NoZerosInBSS(false), GuaranteedTailCallOpt(false), StackSymbolOrdering(true), EnableFastISel(false), EnableGlobalISel(false), UseInitArray(false), - DisableIntegratedAS(false), - RelaxELFRelocations(true), FunctionSections(false), - DataSections(false), IgnoreXCOFFVisibility(false), - XCOFFTracebackTable(true), UniqueSectionNames(true), - UniqueBasicBlockSectionNames(false), TrapUnreachable(false), - NoTrapAfterNoreturn(false), TLSSize(0), EmulatedTLS(false), - ExplicitEmulatedTLS(false), EnableIPRA(false), + DisableIntegratedAS(false), RelaxELFRelocations(true), + FunctionSections(false), DataSections(false), + IgnoreXCOFFVisibility(false), XCOFFTracebackTable(true), + UniqueSectionNames(true), UniqueBasicBlockSectionNames(false), + TrapUnreachable(false), NoTrapAfterNoreturn(false), TLSSize(0), + EmulatedTLS(false), ExplicitEmulatedTLS(false), EnableIPRA(false), EmitStackSizeSection(false), EnableMachineOutliner(false), EnableMachineFunctionSplitter(false), SupportsDefaultOutlining(false), EmitAddrsig(false), EmitCallSiteInfo(false), @@ -144,7 +143,7 @@ namespace llvm { ValueTrackingVariableLocations(false), ForceDwarfFrameSection(false), XRayOmitFunctionIndex(false), DebugStrictDwarf(false), Hotpatch(false), PPCGenScalarMASSEntries(false), JMCInstrument(false), - EnableCFIFixup(false), MisExpect(false), + EnableCFIFixup(false), MisExpect(false), XCOFFReadOnlyPointers(false), FPDenormalMode(DenormalMode::IEEE, DenormalMode::IEEE) {} /// DisableFramePointerElim - This returns true if frame pointer elimination @@ -360,6 +359,10 @@ namespace llvm { /// By default, it is set to false unsigned MisExpect : 1; + /// When set to true, const objects with relocatable address values are put + /// into the RO data section. + unsigned XCOFFReadOnlyPointers : 1; + /// Name of the stack usage file (i.e., .su file) if user passes /// -fstack-usage. If empty, it can be implied that -fstack-usage is not /// passed on the command line. 
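The TargetOptions.h hunk above adds the XCOFFReadOnlyPointers bit alongside the existing DataSections flag. As a rough sketch of how code that constructs TargetOptions might opt in, assuming only the two fields visible in this patch (the helper function and its name below are illustrative, not part of the change):

#include "llvm/Target/TargetOptions.h"

// Illustrative helper: request read-only placement of constant objects whose
// initializers contain relocatable addresses. The llc.cpp change later in
// this patch rejects XCOFFReadOnlyPointers unless data sections are also
// enabled, so both bits are set together here.
llvm::TargetOptions makeReadOnlyPointerOptions() {
  llvm::TargetOptions Opts;
  Opts.DataSections = true;
  Opts.XCOFFReadOnlyPointers = true;
  return Opts;
}

With these options, read-only data with relocations is selected into XMC_RO csects rather than XMC_RW, as implemented by the TargetLoweringObjectFileImpl.cpp change below.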
diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp index 51d259cea41b8..5ef650787a585 100644 --- a/llvm/lib/CodeGen/CommandFlags.cpp +++ b/llvm/lib/CodeGen/CommandFlags.cpp @@ -103,6 +103,7 @@ CGOPT(bool, XRayOmitFunctionIndex) CGOPT(bool, DebugStrictDwarf) CGOPT(unsigned, AlignLoops) CGOPT(bool, JMCInstrument) +CGOPT(bool, XCOFFReadOnlyPointers) codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() { #define CGBINDOPT(NAME) \ @@ -478,6 +479,13 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() { cl::init(false)); CGBINDOPT(JMCInstrument); + static cl::opt XCOFFReadOnlyPointers( + "mroptr", + cl::desc("When set to true, const objects with relocatable address " + "values are put into the RO data section."), + cl::init(false)); + CGBINDOPT(XCOFFReadOnlyPointers); + #undef CGBINDOPT mc::RegisterMCTargetOptionsFlags(); @@ -554,6 +562,7 @@ codegen::InitTargetOptionsFromCodeGenFlags(const Triple &TheTriple) { Options.DebugStrictDwarf = getDebugStrictDwarf(); Options.LoopAlignment = getAlignLoops(); Options.JMCInstrument = getJMCInstrument(); + Options.XCOFFReadOnlyPointers = getXCOFFReadOnlyPointers(); Options.MCOptions = mc::InitMCTargetOptionsFromFlags(); diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index e2fbe027f15b0..c81b6bb623b96 100644 --- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -2343,8 +2343,11 @@ MCSection *TargetLoweringObjectFileXCOFF::getExplicitSectionGlobal( XCOFF::StorageMappingClass MappingClass; if (Kind.isText()) MappingClass = XCOFF::XMC_PR; - else if (Kind.isData() || Kind.isReadOnlyWithRel() || Kind.isBSS()) + else if (Kind.isData() || Kind.isBSS()) MappingClass = XCOFF::XMC_RW; + else if (Kind.isReadOnlyWithRel()) + MappingClass = + TM.Options.XCOFFReadOnlyPointers ? XCOFF::XMC_RO : XCOFF::XMC_RW; else if (Kind.isReadOnly()) MappingClass = XCOFF::XMC_RO; else @@ -2429,9 +2432,18 @@ MCSection *TargetLoweringObjectFileXCOFF::SelectSectionForGlobal( return TextSection; } - // TODO: We may put Kind.isReadOnlyWithRel() under option control, because - // user may want to have read-only data with relocations placed into a - // read-only section by the compiler. 
+ if (TM.Options.XCOFFReadOnlyPointers && Kind.isReadOnlyWithRel()) { + if (!TM.getDataSections()) + report_fatal_error( + "ReadOnlyPointers is supported only if data sections is turned on"); + + SmallString<128> Name; + getNameWithPrefix(Name, GO, TM); + return getContext().getXCOFFSection( + Name, SectionKind::getReadOnly(), + XCOFF::CsectProperties(XCOFF::XMC_RO, XCOFF::XTY_SD)); + } + // For BSS kind, zero initialized data must be emitted to the .data section // because external linkage control sections that get mapped to the .bss // section will be linked as tentative defintions, which is only appropriate diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-roptr.ll b/llvm/test/CodeGen/PowerPC/aix-xcoff-roptr.ll new file mode 100644 index 0000000000000..532d17e087e5b --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-roptr.ll @@ -0,0 +1,30 @@ +; RUN: llc -mtriple powerpc-ibm-aix-xcoff -mroptr < %s | FileCheck %s +; RUN: llc -mtriple powerpc-ibm-aix-xcoff -mroptr -filetype=obj -o %t.o < %s +; RUN: llvm-objdump -t --symbol-description %t.o | FileCheck %s --check-prefix=OBJ + +; RUN: not llc -mtriple powerpc-ibm-aix-xcoff -mroptr -data-sections=false \ +; RUN: < %s 2>&1 | FileCheck %s --check-prefix=DS_ERR + +; DS_ERR: -mroptr option must be used with -data-sections + +%union.U = type { %"struct.U::A" } +%"struct.U::A" = type { ptr } + +@_ZL1p = internal constant i32 ptrtoint (ptr @_ZL1p to i32), align 4 +; CHECK: .csect _ZL1p[RO],2 +; CHECK-NEXT: .lglobl _ZL1p[RO] +; CHECK-NEXT: .align 2 +; CHECK-NEXT: .vbyte 4, _ZL1p[RO] +; OBJ-DAG: {{([[:xdigit:]]{8})}} l .text {{([[:xdigit:]]{8})}} (idx: [[#]]) _ZL1p[RO] +@q = thread_local constant ptr @_ZL1p, align 4 +; CHECK: .csect q[TL],2 +; CHECK-NEXT: .globl q[TL] +; CHECK-NEXT: .align 2 +; CHECK-NEXT: .vbyte 4, _ZL1p[RO] +; OBJ-DAG: {{([[:xdigit:]]{8})}} g O .tdata {{([[:xdigit:]]{8})}} (idx: [[#]]) q[TL] +@u = local_unnamed_addr constant [1 x %union.U] [%union.U { %"struct.U::A" { ptr @_ZL1p } }], align 4 +; CHECK: .csect u[RO],2 +; CHECK-NEXT: .globl u[RO] +; CHECK-NEXT: .align 2 +; CHECK-NEXT: .vbyte 4, _ZL1p[RO] +; OBJ-DAG: {{([[:xdigit:]]{8})}} g .text {{([[:xdigit:]]{8})}} (idx: [[#]]) u[RO] diff --git a/llvm/test/CodeGen/PowerPC/aix64-xcoff-roptr.ll b/llvm/test/CodeGen/PowerPC/aix64-xcoff-roptr.ll new file mode 100644 index 0000000000000..aff753661b0e1 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/aix64-xcoff-roptr.ll @@ -0,0 +1,33 @@ +; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -mroptr < %s | FileCheck %s +; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -mroptr -filetype=obj -o %t.o < %s +; RUN: llvm-objdump -t --symbol-description %t.o | FileCheck %s --check-prefix=OBJ + +; RUN: not llc -mtriple powerpc64-ibm-aix-xcoff -mroptr -data-sections=false \ +; RUN: < %s 2>&1 | FileCheck %s --check-prefix=DS_ERR +; RUN: not llc -mtriple powerpc64le-unknown-linux-gnu -mroptr \ +; RUN: < %s 2>&1 | FileCheck %s --check-prefix=OS_ERR + +; DS_ERR: -mroptr option must be used with -data-sections +; OS_ERR: -mroptr option is only supported on AIX + +%union.U = type { %"struct.U::A" } +%"struct.U::A" = type { ptr } + +@_ZL1p = internal constant i64 ptrtoint (ptr @_ZL1p to i64), align 8 +; CHECK: .csect _ZL1p[RO],3 +; CHECK-NEXT: .lglobl _ZL1p[RO] +; CHECK-NEXT: .align 3 +; CHECK-NEXT: .vbyte 8, _ZL1p[RO] +; OBJ-DAG: {{([[:xdigit:]]{16})}} l .text {{([[:xdigit:]]{16})}} (idx: [[#]]) _ZL1p[RO] +@q = thread_local constant ptr @_ZL1p, align 8 +; CHECK: .csect q[TL],3 +; CHECK-NEXT: .globl q[TL] +; CHECK-NEXT: .align 3 +; CHECK-NEXT: .vbyte 8, _ZL1p[RO] +; 
OBJ-DAG: {{([[:xdigit:]]{16})}} g O .tdata {{([[:xdigit:]]{16})}} (idx: [[#]]) q[TL] +@u = local_unnamed_addr constant [1 x %union.U] [%union.U { %"struct.U::A" { ptr @_ZL1p } }], align 8 +; CHECK: .csect u[RO],3 +; CHECK-NEXT: .globl u[RO] +; CHECK-NEXT: .align 3 +; CHECK-NEXT: .vbyte 8, _ZL1p[RO] +; OBJ-DAG: {{([[:xdigit:]]{16})}} g .text {{([[:xdigit:]]{16})}} (idx: [[#]]) u[RO] diff --git a/llvm/tools/llc/llc.cpp b/llvm/tools/llc/llc.cpp index ed65b83487790..860fa39d57e8a 100644 --- a/llvm/tools/llc/llc.cpp +++ b/llvm/tools/llc/llc.cpp @@ -496,6 +496,24 @@ static int compileModule(char **argv, LLVMContext &Context) { TargetOptions Options; auto InitializeOptions = [&](const Triple &TheTriple) { Options = codegen::InitTargetOptionsFromCodeGenFlags(TheTriple); + + if (Options.XCOFFReadOnlyPointers) { + if (!TheTriple.isOSAIX()) + reportError("-mroptr option is only supported on AIX", InputFilename); + + // Since the storage mapping class is specified per csect, + // without using data sections, it is less effective to use read-only + // pointers. Using read-only pointers may cause other RO variables in the + // same csect to become RW when the linker acts upon `-bforceimprw`; + // therefore, we require that separate data sections are used in the + // presence of ReadOnlyPointers. We respect the setting of data-sections + // since we have not found reasons to do otherwise that overcome the user + // surprise of not respecting the setting. + if (!Options.DataSections) + reportError("-mroptr option must be used with -data-sections", + InputFilename); + } + Options.BinutilsVersion = TargetMachine::parseBinutilsVersion(BinutilsVersion); Options.DisableIntegratedAS = NoIntegratedAssembler; From 5193c4a8b38c3e61c862d5badf1cace7c26324f7 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Thu, 23 Mar 2023 13:46:49 +0000 Subject: [PATCH 126/208] [lldb][AArch64] Fix run-qemu.sh when only MTE is enabled. SVE and MTE both require a CPU with that feature before you can use the other options, but we only added the "max" cpu when SVE was enabled too. 
--- lldb/scripts/lldb-test-qemu/run-qemu.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) mode change 100644 => 100755 lldb/scripts/lldb-test-qemu/run-qemu.sh diff --git a/lldb/scripts/lldb-test-qemu/run-qemu.sh b/lldb/scripts/lldb-test-qemu/run-qemu.sh old mode 100644 new mode 100755 index 339b8d955e613..d11711c10e772 --- a/lldb/scripts/lldb-test-qemu/run-qemu.sh +++ b/lldb/scripts/lldb-test-qemu/run-qemu.sh @@ -109,8 +109,12 @@ elif [[ "$ARCH" == "arm64" ]]; then QEMU_SVE_MAX_VQ=4 QEMU_CPU="cortex-a53" + if [[ $SVE ]] || [[ $MTE ]]; then + QEMU_CPU="max" + fi + if [[ $SVE ]]; then - QEMU_CPU="max,sve-max-vq=$QEMU_SVE_MAX_VQ" + QEMU_CPU="$QEMU_CPU,sve-max-vq=$QEMU_SVE_MAX_VQ" fi if [[ $MTE ]]; then QEMU_MACHINE="$QEMU_MACHINE,mte=on" From 3ab79124db5e4e1be0b58c4fe43ff01e6fdb3060 Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Thu, 23 Mar 2023 08:56:47 -0500 Subject: [PATCH 127/208] [OpenMP] Add notifyDataUnmapped back in disassociatePtr Fix regression introduced by https://reviews.llvm.org/D123446 Reviewed By: tianshilei1992 Differential Revision: https://reviews.llvm.org/D146689 --- openmp/libomptarget/src/device.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp index 09c8e808db463..22ab7436f75b5 100644 --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -134,7 +134,7 @@ int DeviceTy::disassociatePtr(void *HstPtrBegin) { if (Event) destroyEvent(Event); HDTTMap->erase(It); - return OFFLOAD_SUCCESS; + return notifyDataUnmapped(HstPtrBegin); } REPORT("Trying to disassociate a pointer which was not mapped via " From 0eabf59528f3c3f64923900cae740d9f26c45ae8 Mon Sep 17 00:00:00 2001 From: Doru Bercea Date: Tue, 21 Mar 2023 14:07:57 -0400 Subject: [PATCH 128/208] Enable constexpr class members that are device-mapped to not be optimized out. This patch fixes an issue whereby a constexpr class member which is mapped to the device is being optimized out thus leading to a runtime error. Patch: https://reviews.llvm.org/D146552 --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 4 +- .../declare_target_constexpr_codegen.cpp | 40 +++++++++++++++++++ .../offloading/target_constexpr_mapping.cpp | 34 ++++++++++++++++ 3 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 clang/test/OpenMP/declare_target_constexpr_codegen.cpp create mode 100644 openmp/libomptarget/test/offloading/target_constexpr_mapping.cpp diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 5f21cfca66bb8..58a95d64ac50e 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -10387,7 +10387,9 @@ void CGOpenMPRuntime::registerTargetGlobalVariable(const VarDecl *VD, } Linkage = CGM.getLLVMLinkageVarDefinition(VD, /*IsConstant=*/false); // Temp solution to prevent optimizations of the internal variables. - if (CGM.getLangOpts().OpenMPIsDevice && !VD->isExternallyVisible()) { + if (CGM.getLangOpts().OpenMPIsDevice && + (!VD->isExternallyVisible() || + Linkage == llvm::GlobalValue::LinkOnceODRLinkage)) { // Do not create a "ref-variable" if the original is not also available // on the host. 
if (!OffloadEntriesInfoManager.hasDeviceGlobalVarEntryInfo(VarName)) diff --git a/clang/test/OpenMP/declare_target_constexpr_codegen.cpp b/clang/test/OpenMP/declare_target_constexpr_codegen.cpp new file mode 100644 index 0000000000000..27161feef05e0 --- /dev/null +++ b/clang/test/OpenMP/declare_target_constexpr_codegen.cpp @@ -0,0 +1,40 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals --prefix-filecheck-ir-name _ --global-value-regex "llvm.compiler.used" "_[0-9a-zA-Z]+A[0-9a-zA-Z]+pi[0-9a-zA-Z]+" "_[0-9a-zA-Z]+anotherPi" --version 2 +// REQUIRES: amdgpu-registered-target + + +// Test target codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-debug -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK + +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +#pragma omp declare target +class A { +public: + static constexpr double pi = 3.141592653589793116; +//. +// CHECK: @_ZN1A2piE = linkonce_odr constant double 0x400921FB54442D18, comdat, align 8 +// CHECK: @_ZL9anotherPi = internal constant double 3.140000e+00, align 8 +// CHECK: @llvm.compiler.used = appending global [2 x ptr] [ptr @"__ZN1A2piE$ref", ptr @"__ZL9anotherPi$ref"], section "llvm.metadata" +//. + A() { ; } + ~A() { ; } +}; +#pragma omp end declare target + +void F(const double &); +void Test() { F(A::pi); } + +#pragma omp declare target +constexpr static double anotherPi = 3.14; +#pragma omp end declare target + +#endif + + +// +//// NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +// CHECK: {{.*}} diff --git a/openmp/libomptarget/test/offloading/target_constexpr_mapping.cpp b/openmp/libomptarget/test/offloading/target_constexpr_mapping.cpp new file mode 100644 index 0000000000000..14cf92a7cc26e --- /dev/null +++ b/openmp/libomptarget/test/offloading/target_constexpr_mapping.cpp @@ -0,0 +1,34 @@ +// RUN: %libomptarget-compileoptxx-run-and-check-generic + +#include +#include + +#pragma omp declare target +class A { +public: + constexpr static double pi = 3.141592653589793116; + A() { ; } + ~A() { ; } +}; +#pragma omp end declare target + +#pragma omp declare target +constexpr static double anotherPi = 3.14; +#pragma omp end declare target + +int main() { + double a[2]; +#pragma omp target map(tofrom : a[:2]) + { + a[0] = A::pi; + a[1] = anotherPi; + } + + // CHECK: pi = 3.141592653589793116 + printf("pi = %.18f\n", a[0]); + + // CHECK: anotherPi = 3.14 + printf("anotherPi = %.2f\n", a[1]); + + return 0; +} From c7a3284de3059ecb5940dac19dda897ade0d11b4 Mon Sep 17 00:00:00 2001 From: khei4 Date: Thu, 9 Mar 2023 15:31:11 +0900 Subject: [PATCH 129/208] [AggressiveInstCombine] Pre-Commit test for D144445 (NFC) Differential Revision: https://reviews.llvm.org/D145355 tweak: test --- .../AggressiveInstCombine/patterned-load.ll | 189 ++++++++++++++++++ .../InstSimplify/load-patterned-aggregates.ll | 134 ------------- llvm/test/Transforms/InstSimplify/load.ll | 20 ++ 3 files changed, 209 insertions(+), 134 deletions(-) create mode 100644 llvm/test/Transforms/AggressiveInstCombine/patterned-load.ll delete mode 100644 llvm/test/Transforms/InstSimplify/load-patterned-aggregates.ll diff --git a/llvm/test/Transforms/AggressiveInstCombine/patterned-load.ll b/llvm/test/Transforms/AggressiveInstCombine/patterned-load.ll new file mode 100644 index 0000000000000..5410a21e3211d --- /dev/null +++ b/llvm/test/Transforms/AggressiveInstCombine/patterned-load.ll @@ -0,0 +1,189 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=aggressive-instcombine -S -data-layout="e" | FileCheck %s --check-prefixes=CHECK,LE +; RUN: opt < %s -passes=aggressive-instcombine -S -data-layout="E" | FileCheck %s --check-prefixes=CHECK,BE + + +@constarray1 = internal constant [8 x i8] c"\01\00\01\00\01\00\01\00", align 4 +@constarray2 = internal constant [8 x i8] c"\FF\FF\01\00\01\00\01\00", align 4 + +@g = internal constant i32 42 +@constptrarray = internal constant [4 x ptr] [ptr @g, ptr @g, ptr @g, ptr @g], align 4 + +@constpackedstruct = internal constant <{[8 x i8]}> <{[8 x i8] c"\01\00\01\00\01\00\01\00"}>, align 4 +@conststruct = internal constant {i16, [8 x i8]} {i16 1, [8 x i8] c"\01\00\01\00\01\00\01\00"}, align 4 + +; TODO: this will be ret i8 1 +define i8 @inbounds_gep_load_i8_align2(i64 %idx){ +; CHECK-LABEL: @inbounds_gep_load_i8_align2( +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr @constarray1, i64 [[IDX:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[TMP1]], align 2 +; CHECK-NEXT: ret i8 [[TMP2]] +; + %1 = getelementptr inbounds i8, ptr @constarray1, i64 %idx + %2 = load i8, ptr %1, align 2 + ret i8 %2 +} + +; can't be folded because access with i8 strides is not patterned. 
+define i8 @inbounds_gep_load_i8_align1(i64 %idx){ +; CHECK-LABEL: @inbounds_gep_load_i8_align1( +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr @constarray1, i64 [[IDX:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[TMP1]], align 1 +; CHECK-NEXT: ret i8 [[TMP2]] +; + %1 = getelementptr inbounds i8, ptr @constarray1, i64 %idx + %2 = load i8, ptr %1, align 1 + ret i8 %2 +} + +; can't be folded because volatile load cannot assure same results. +define i8 @inbounds_gep_load_i8_align2_volatile(i64 %idx){ +; CHECK-LABEL: @inbounds_gep_load_i8_align2_volatile( +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr @constarray1, i64 [[IDX:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = load volatile i8, ptr [[TMP1]], align 2 +; CHECK-NEXT: ret i8 [[TMP2]] +; + %1 = getelementptr inbounds i8, ptr @constarray1, i64 %idx + %2 = load volatile i8, ptr %1, align 2 + ret i8 %2 +} + +declare ptr @llvm.ptrmask.p0.i64(ptr , i64) + +; can't be folded because ptrmask can change ptr, while preserving provenance +define i8 @inbounds_gep_load_i8_align2_ptrmasked(i64 %idx, i64 %mask){ +; CHECK-LABEL: @inbounds_gep_load_i8_align2_ptrmasked( +; CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr @constarray1, i64 [[MASK:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[IDX:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[TMP2]], align 2 +; CHECK-NEXT: ret i8 [[TMP3]] +; + %1 = call ptr @llvm.ptrmask.p0.i64(ptr @constarray1, i64 %mask) + %2 = getelementptr inbounds i8, ptr %1, i64 %idx + %3 = load i8, ptr %2, align 2 + ret i8 %3 +} + +; TODO: this will be ret i32 65537(LE), 16777472(BE) +define i32 @inbounds_gep_i16_load_i32_align1(i64 %idx){ +; CHECK-LABEL: @inbounds_gep_i16_load_i32_align1( +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr @constarray1, i64 [[IDX:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 1 +; CHECK-NEXT: ret i32 [[TMP2]] +; + %1 = getelementptr inbounds i16, ptr @constarray1, i64 %idx + %2 = load i32, ptr %1, align 1 + ret i32 %2 +} + +; TODO: this will be ret i32 65537(LE), 16777472(BE) +define i32 @inbounds_gep_i32_load_i32_align8(i64 %idx){ +; CHECK-LABEL: @inbounds_gep_i32_load_i32_align8( +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr @constarray1, i64 [[IDX:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 8 +; CHECK-NEXT: ret i32 [[TMP2]] +; + %1 = getelementptr inbounds i32, ptr @constarray1, i64 %idx + %2 = load i32, ptr %1, align 8 + ret i32 %2 +} + +; TODO: this will be ret i32 65547(LE), 16777472(BE) +define i32 @inbounds_gep_i32_load_i32_const_offset(i64 %idx){ +; CHECK-LABEL: @inbounds_gep_i32_load_i32_const_offset( +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr @constarray2, i64 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[IDX:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 +; CHECK-NEXT: ret i32 [[TMP3]] +; + %1 = getelementptr inbounds i16, ptr @constarray2, i64 1 + %2 = getelementptr inbounds i32, ptr %1, i64 %idx + %3 = load i32, ptr %2, align 4 + ret i32 %3 +} + +; TODO: this coould be folded into 65537(LE), 16777472(BE) +define i32 @gep_load_i32_align2_const_offset(i64 %idx){ +; CHECK-LABEL: @gep_load_i32_align2_const_offset( +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr @constarray1, i64 -2 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [3 x i16], ptr [[TMP1]], i64 [[IDX:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 2 +; CHECK-NEXT: ret i32 [[TMP3]] +; + %1 = 
getelementptr i16, ptr @constarray1, i64 -2 + %2 = getelementptr [3 x i16], ptr %1, i64 %idx + %3 = load i32, ptr %2, align 2 + ret i32 %3 +} + +; can't be folded because if gep is non-inbounds, +; the offsets are silently-wrapped with two’s complement arithmetic(mod 2**64). +; So the load operand can be a base pointer of constarray2. +define i32 @gep_load_i32_align2_const_offset_wrap(i64 %idx){ +; CHECK-LABEL: @gep_load_i32_align2_const_offset_wrap( +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr @constarray2, i64 -2 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [3 x i16], ptr [[TMP1]], i64 [[IDX:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 2 +; CHECK-NEXT: ret i32 [[TMP3]] +; + %1 = getelementptr i16, ptr @constarray2, i64 -2 + %2 = getelementptr [3 x i16], ptr %1, i64 %idx + %3 = load i32, ptr %2, align 2 + ret i32 %3 +} + +; TODO: this will be ret i32 42 +define i32 @inbounds_gep_i32_load_i32_const_ptr_array(i64 %idx){ +; CHECK-LABEL: @inbounds_gep_i32_load_i32_const_ptr_array( +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds ptr, ptr @constptrarray, i64 [[IDX:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 +; CHECK-NEXT: ret i32 [[TMP3]] +; + %1 = getelementptr inbounds ptr, ptr @constptrarray, i64 %idx + %2 = load ptr, ptr %1, align 4 + %3 = load i32, ptr %2, align 4 + ret i32 %3 +} + +; TODO: this coould be folded into 65537(LE), 16777472(BE) +define i32 @inbounds_gep_i32_load_i32_align4_packedstruct(i64 %idx){ +; CHECK-LABEL: @inbounds_gep_i32_load_i32_align4_packedstruct( +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr @constpackedstruct, i64 [[IDX:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 +; CHECK-NEXT: ret i32 [[TMP2]] +; + %1 = getelementptr inbounds i32, ptr @constpackedstruct, i64 %idx + %2 = load i32, ptr %1, align 4 + ret i32 %2 +} + +; can't be folded because results are not equal +define i32 @inbounds_gep_i8_load_i32_align1_packedstruct(i64 %idx){ +; CHECK-LABEL: @inbounds_gep_i8_load_i32_align1_packedstruct( +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr @constpackedstruct, i64 [[IDX:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 1 +; CHECK-NEXT: ret i32 [[TMP2]] +; + %1 = getelementptr inbounds i8, ptr @constpackedstruct, i64 %idx + %2 = load i32, ptr %1, align 1 + ret i32 %2 +} + +; TODO: this coould be folded into 65537(LE), 16777472(BE) +define i32 @inbounds_gep_i32_load_i32_align4_struct_with_const_offset(i64 %idx){ +; CHECK-LABEL: @inbounds_gep_i32_load_i32_align4_struct_with_const_offset( +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr @conststruct, i64 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[IDX:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 +; CHECK-NEXT: ret i32 [[TMP3]] +; + %1 = getelementptr inbounds i16, ptr @conststruct, i64 1 + %2 = getelementptr inbounds i32, ptr %1, i64 %idx + %3 = load i32, ptr %2, align 4 + ret i32 %3 +} + +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; BE: {{.*}} +; LE: {{.*}} diff --git a/llvm/test/Transforms/InstSimplify/load-patterned-aggregates.ll b/llvm/test/Transforms/InstSimplify/load-patterned-aggregates.ll deleted file mode 100644 index 82283648936cf..0000000000000 --- a/llvm/test/Transforms/InstSimplify/load-patterned-aggregates.ll +++ /dev/null @@ -1,134 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=instsimplify -S | FileCheck %s -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32" -@constzeroarray = internal constant [4 x i32] zeroinitializer - -@constarray = internal constant [8 x i8] c"\01\00\01\00\01\00\01\00", align 4 -@conststruct = internal constant <{[8 x i8]}> <{[8 x i8] c"\01\00\01\00\01\00\01\00"}>, align 4 - -define i32 @load_gep_const_zero_array(i64 %idx) { -; CHECK-LABEL: @load_gep_const_zero_array( -; CHECK-NEXT: ret i32 0 -; - %gep = getelementptr inbounds [4 x i32], ptr @constzeroarray, i64 0, i64 %idx - %load = load i32, ptr %gep - ret i32 %load -} - -define i8 @load_i8_multi_gep_const_zero_array(i64 %idx1, i64 %idx2) { -; CHECK-LABEL: @load_i8_multi_gep_const_zero_array( -; CHECK-NEXT: ret i8 0 -; - %gep1 = getelementptr inbounds i8, ptr @constzeroarray, i64 %idx1 - %gep = getelementptr inbounds i8, ptr %gep1, i64 %idx2 - %load = load i8, ptr %gep - ret i8 %load -} - - -define i32 @load_gep_const_patterned_array(i64 %idx) { -; CHECK-LABEL: @load_gep_const_patterned_array( -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [4 x i32], ptr @constarray, i64 0, i64 [[IDX:%.*]] -; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[GEP]], align 4 -; CHECK-NEXT: ret i32 [[LOAD]] -; - %gep = getelementptr inbounds [4 x i32], ptr @constarray, i64 0, i64 %idx - %load = load i32, ptr %gep - ret i32 %load -} - -define i8 @load_i8_multi_gep_const_array(i64 %idx1, i64 %idx2) { -; CHECK-LABEL: @load_i8_multi_gep_const_array( -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr @constarray, i64 [[IDX1:%.*]] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[GEP1]], i64 [[IDX2:%.*]] -; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[GEP]], align 1 -; CHECK-NEXT: ret i8 [[LOAD]] -; - %gep1 = getelementptr inbounds i8, ptr @constarray, i64 %idx1 - %gep = getelementptr inbounds i8, ptr %gep1, i64 %idx2 - %load = load i8, ptr %gep - ret i8 %load -} - -; TODO: this should be ret i8 1 -define i8 @gep_load_i8_align2(i64 %idx){ -; CHECK-LABEL: @gep_load_i8_align2( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr @constarray, i64 [[IDX:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[TMP1]], align 2 -; CHECK-NEXT: ret i8 [[TMP2]] -; - %1 = getelementptr inbounds i8, ptr @constarray, i64 %idx - %2 = load i8, ptr %1, align 2 - ret i8 %2 -} - -; can't be folded -define i8 @gep_load_i8_align1(i64 %idx){ -; CHECK-LABEL: @gep_load_i8_align1( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr @constarray, i64 [[IDX:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[TMP1]], align 1 -; CHECK-NEXT: ret i8 [[TMP2]] -; - %1 = getelementptr inbounds i8, ptr @constarray, i64 %idx - %2 = load i8, ptr %1, align 1 - ret i8 %2 -} - -; TODO: this should be ret i8 65537 on the case for little endian -define i32 @gep_i32_load_i32_align4(i64 %idx){ -; CHECK-LABEL: @gep_i32_load_i32_align4( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr @constarray, i64 [[IDX:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], 
align 4 -; CHECK-NEXT: ret i32 [[TMP2]] -; - %1 = getelementptr inbounds i32, ptr @constarray, i64 %idx - %2 = load i32, ptr %1, align 4 - ret i32 %2 -} - -; TODO: this should be ret i8 65537 on the case for little endian -define i32 @gep_i32_load_i32_align4_struct(i64 %idx){ -; CHECK-LABEL: @gep_i32_load_i32_align4_struct( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr @conststruct, i64 [[IDX:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 -; CHECK-NEXT: ret i32 [[TMP2]] -; - %1 = getelementptr inbounds i32, ptr @conststruct, i64 %idx - %2 = load i32, ptr %1, align 4 - ret i32 %2 -} - -; can't be folded -define i32 @gep_i8_load_i32_align1(i64 %idx){ -; CHECK-LABEL: @gep_i8_load_i32_align1( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr @constarray, i64 [[IDX:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 1 -; CHECK-NEXT: ret i32 [[TMP2]] -; - %1 = getelementptr inbounds i8, ptr @constarray, i64 %idx - %2 = load i32, ptr %1, align 1 - ret i32 %2 -} - -; can't be folded -define i32 @gep_i8_load_i32_align1_struct(i64 %idx){ -; CHECK-LABEL: @gep_i8_load_i32_align1_struct( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr @conststruct, i64 [[IDX:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 1 -; CHECK-NEXT: ret i32 [[TMP2]] -; - %1 = getelementptr inbounds i8, ptr @conststruct, i64 %idx - %2 = load i32, ptr %1, align 1 - ret i32 %2 -} -; TODO: This could be folded but need to see GEP source types -define i32 @gep_i16_load_i32_align1(i64 %idx){ -; CHECK-LABEL: @gep_i16_load_i32_align1( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr @constarray, i64 [[IDX:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 1 -; CHECK-NEXT: ret i32 [[TMP2]] -; - %1 = getelementptr inbounds i16, ptr @constarray, i64 %idx - %2 = load i32, ptr %1, align 1 - ret i32 %2 -} diff --git a/llvm/test/Transforms/InstSimplify/load.ll b/llvm/test/Transforms/InstSimplify/load.ll index 2e2b1b14ddd0f..8b9a607f77b7d 100644 --- a/llvm/test/Transforms/InstSimplify/load.ll +++ b/llvm/test/Transforms/InstSimplify/load.ll @@ -3,6 +3,7 @@ @zeroinit = constant {} zeroinitializer @poison = constant {} poison +@constzeroarray = internal constant [4 x i32] zeroinitializer define i32 @crash_on_zeroinit() { ; CHECK-LABEL: @crash_on_zeroinit( @@ -40,3 +41,22 @@ define <3 x float> @load_vec3() { %1 = load <3 x float>, ptr getelementptr inbounds (<3 x float>, ptr @constvec, i64 1) ret <3 x float> %1 } + +define i32 @load_gep_const_zero_array(i64 %idx) { +; CHECK-LABEL: @load_gep_const_zero_array( +; CHECK-NEXT: ret i32 0 +; + %gep = getelementptr inbounds [4 x i32], ptr @constzeroarray, i64 0, i64 %idx + %load = load i32, ptr %gep + ret i32 %load +} + +define i8 @load_i8_multi_gep_const_zero_array(i64 %idx1, i64 %idx2) { +; CHECK-LABEL: @load_i8_multi_gep_const_zero_array( +; CHECK-NEXT: ret i8 0 +; + %gep1 = getelementptr inbounds i8, ptr @constzeroarray, i64 %idx1 + %gep = getelementptr inbounds i8, ptr %gep1, i64 %idx2 + %load = load i8, ptr %gep + ret i8 %load +} From 434b0badb5d53138490a075dd945df7480649154 Mon Sep 17 00:00:00 2001 From: khei4 Date: Thu, 9 Mar 2023 18:46:14 +0900 Subject: [PATCH 130/208] [AggressiveInstCombine] folding load for constant global patterened arrays and structs by alignment Differential Revision: https://reviews.llvm.org/D144445 Reviewed By: nikic fix: wrong arrow --- .../AggressiveInstCombine.cpp | 60 ++++++++++++++++++- .../AggressiveInstCombine/patterned-load.ll | 47 +++++++-------- 
2 files changed, 77 insertions(+), 30 deletions(-) diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 473b41241b8a6..cf652836bef25 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -18,6 +18,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -305,7 +306,7 @@ static bool tryToRecognizePopCount(Instruction &I) { Value *MulOp0; // Matching "(i * 0x01010101...) >> 24". if ((match(Op0, m_Mul(m_Value(MulOp0), m_SpecificInt(Mask01)))) && - match(Op1, m_SpecificInt(MaskShift))) { + match(Op1, m_SpecificInt(MaskShift))) { Value *ShiftOp0; // Matching "((i + (i >> 4)) & 0x0F0F0F0F...)". if (match(MulOp0, m_And(m_c_Add(m_LShr(m_Value(ShiftOp0), m_SpecificInt(4)), @@ -401,8 +402,8 @@ static bool tryToFPToSat(Instruction &I, TargetTransformInfo &TTI) { /// Try to replace a mathlib call to sqrt with the LLVM intrinsic. This avoids /// pessimistic codegen that has to account for setting errno and can enable /// vectorization. -static bool -foldSqrt(Instruction &I, TargetTransformInfo &TTI, TargetLibraryInfo &TLI) { +static bool foldSqrt(Instruction &I, TargetTransformInfo &TTI, + TargetLibraryInfo &TLI) { // Match a call to sqrt mathlib function. auto *Call = dyn_cast(&I); if (!Call) @@ -824,6 +825,58 @@ static bool foldConsecutiveLoads(Instruction &I, const DataLayout &DL, return true; } +/// If C is a constant patterned array and all valid loaded results for given +/// alignment are same to a constant, return that constant. +static bool foldPatternedLoads(Instruction &I, const DataLayout &DL) { + auto *LI = dyn_cast(&I); + if (!LI || LI->isVolatile()) + return false; + + // We can only fold the load if it is from a constant global with definitive + // initializer. Skip expensive logic if this is not the case. + auto *PtrOp = LI->getPointerOperand(); + auto *GV = dyn_cast(getUnderlyingObject(PtrOp)); + if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer()) + return false; + + Type *LoadTy = LI->getType(); + Constant *C = GV->getInitializer(); + + // Bail for large initializers in excess of 4K to avoid too many scans. + uint64_t GVSize = DL.getTypeAllocSize(C->getType()); + if (!GVSize || 4096 < GVSize) + return false; + + // Check whether pointer arrives back at Global Variable. + // If PtrOp is neither GlobalVariable nor GEP, it might not arrive back at + // GlobalVariable. + // TODO: implement GEP handling + unsigned BW = DL.getIndexTypeSizeInBits(PtrOp->getType()); + // TODO: Determine stride based on GEPs. + APInt Stride(BW, 1); + APInt ConstOffset(BW, 0); + + // Any possible offset could be multiple of GEP stride. And any valid + // offset is multiple of load alignment, so checking only multiples of bigger + // one is sufficient to say results' equality. 
+ if (auto LA = LI->getAlign(); + LA <= GV->getAlign().valueOrOne() && Stride.getZExtValue() < LA.value()) + Stride = APInt(BW, LA.value()); + + Constant *Ca = ConstantFoldLoadFromConst(C, LoadTy, ConstOffset, DL); + if (!Ca) + return false; + + unsigned E = GVSize - DL.getTypeStoreSize(LoadTy); + for (; ConstOffset.getZExtValue() <= E; ConstOffset += Stride) + if (Ca != ConstantFoldLoadFromConst(C, LoadTy, ConstOffset, DL)) + return false; + + I.replaceAllUsesWith(Ca); + + return true; +} + /// This is the entry point for folds that could be implemented in regular /// InstCombine, but they are separated because they are not expected to /// occur frequently and/or have more than a constant-length pattern match. @@ -850,6 +903,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT, MadeChange |= tryToFPToSat(I, TTI); MadeChange |= tryToRecognizeTableBasedCttz(I); MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA); + MadeChange |= foldPatternedLoads(I, DL); // NOTE: This function introduces erasing of the instruction `I`, so it // needs to be called at the end of this sequence, otherwise we may make // bugs. diff --git a/llvm/test/Transforms/AggressiveInstCombine/patterned-load.ll b/llvm/test/Transforms/AggressiveInstCombine/patterned-load.ll index 5410a21e3211d..7acc6109744ca 100644 --- a/llvm/test/Transforms/AggressiveInstCombine/patterned-load.ll +++ b/llvm/test/Transforms/AggressiveInstCombine/patterned-load.ll @@ -12,12 +12,9 @@ @constpackedstruct = internal constant <{[8 x i8]}> <{[8 x i8] c"\01\00\01\00\01\00\01\00"}>, align 4 @conststruct = internal constant {i16, [8 x i8]} {i16 1, [8 x i8] c"\01\00\01\00\01\00\01\00"}, align 4 -; TODO: this will be ret i8 1 define i8 @inbounds_gep_load_i8_align2(i64 %idx){ ; CHECK-LABEL: @inbounds_gep_load_i8_align2( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr @constarray1, i64 [[IDX:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[TMP1]], align 2 -; CHECK-NEXT: ret i8 [[TMP2]] +; CHECK-NEXT: ret i8 1 ; %1 = getelementptr inbounds i8, ptr @constarray1, i64 %idx %2 = load i8, ptr %1, align 2 @@ -53,10 +50,7 @@ declare ptr @llvm.ptrmask.p0.i64(ptr , i64) ; can't be folded because ptrmask can change ptr, while preserving provenance define i8 @inbounds_gep_load_i8_align2_ptrmasked(i64 %idx, i64 %mask){ ; CHECK-LABEL: @inbounds_gep_load_i8_align2_ptrmasked( -; CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr @constarray1, i64 [[MASK:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[IDX:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[TMP2]], align 2 -; CHECK-NEXT: ret i8 [[TMP3]] +; CHECK-NEXT: ret i8 1 ; %1 = call ptr @llvm.ptrmask.p0.i64(ptr @constarray1, i64 %mask) %2 = getelementptr inbounds i8, ptr %1, i64 %idx @@ -102,13 +96,12 @@ define i32 @inbounds_gep_i32_load_i32_const_offset(i64 %idx){ ret i32 %3 } -; TODO: this coould be folded into 65537(LE), 16777472(BE) define i32 @gep_load_i32_align2_const_offset(i64 %idx){ -; CHECK-LABEL: @gep_load_i32_align2_const_offset( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr @constarray1, i64 -2 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [3 x i16], ptr [[TMP1]], i64 [[IDX:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 2 -; CHECK-NEXT: ret i32 [[TMP3]] +; LE-LABEL: @gep_load_i32_align2_const_offset( +; LE-NEXT: ret i32 65537 +; +; BE-LABEL: @gep_load_i32_align2_const_offset( +; BE-NEXT: ret i32 16777472 ; %1 = getelementptr i16, ptr @constarray1, i64 -2 %2 = getelementptr [3 x i16], ptr %1, i64 %idx @@ 
-146,12 +139,12 @@ define i32 @inbounds_gep_i32_load_i32_const_ptr_array(i64 %idx){ ret i32 %3 } -; TODO: this coould be folded into 65537(LE), 16777472(BE) define i32 @inbounds_gep_i32_load_i32_align4_packedstruct(i64 %idx){ -; CHECK-LABEL: @inbounds_gep_i32_load_i32_align4_packedstruct( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr @constpackedstruct, i64 [[IDX:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 -; CHECK-NEXT: ret i32 [[TMP2]] +; LE-LABEL: @inbounds_gep_i32_load_i32_align4_packedstruct( +; LE-NEXT: ret i32 65537 +; +; BE-LABEL: @inbounds_gep_i32_load_i32_align4_packedstruct( +; BE-NEXT: ret i32 16777472 ; %1 = getelementptr inbounds i32, ptr @constpackedstruct, i64 %idx %2 = load i32, ptr %1, align 4 @@ -172,11 +165,14 @@ define i32 @inbounds_gep_i8_load_i32_align1_packedstruct(i64 %idx){ ; TODO: this coould be folded into 65537(LE), 16777472(BE) define i32 @inbounds_gep_i32_load_i32_align4_struct_with_const_offset(i64 %idx){ -; CHECK-LABEL: @inbounds_gep_i32_load_i32_align4_struct_with_const_offset( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr @conststruct, i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[IDX:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 -; CHECK-NEXT: ret i32 [[TMP3]] +; LE-LABEL: @inbounds_gep_i32_load_i32_align4_struct_with_const_offset( +; LE-NEXT: ret i32 65537 +; +; BE-LABEL: @inbounds_gep_i32_load_i32_align4_struct_with_const_offset( +; BE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr @conststruct, i64 1 +; BE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[IDX:%.*]] +; BE-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 +; BE-NEXT: ret i32 [[TMP3]] ; %1 = getelementptr inbounds i16, ptr @conststruct, i64 1 %2 = getelementptr inbounds i32, ptr %1, i64 %idx @@ -184,6 +180,3 @@ define i32 @inbounds_gep_i32_load_i32_align4_struct_with_const_offset(i64 %idx){ ret i32 %3 } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; BE: {{.*}} -; LE: {{.*}} From 5bcb4c4da99c443fb880d408e5ff4e9b305bbb77 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 23 Mar 2023 07:23:36 -0700 Subject: [PATCH 131/208] [MSAN] Support load and stores of scalable vector types This adds support for scalable vector types - at least far enough to get basic load and store cases working. It turns out that load/store without origin tracking already worked; I apparently got that working with one of the pre patches to use TypeSize utilities and didn't notice. The code changes here are required to enable origin tracking. For origin tracking, a 4 byte value - the origin - is broadcast into a shadow region whose size exactly matches the type being accessed. This origin is only written if the shadow value is non-zero. The details of how shadow is computed from the original value being stored aren't relevant for this patch. The code changes involve two related primitives. First, we need to be able to perform that broadcast into a scalable sized memory region. This requires the use of a loop, and appropriate bound. The fixed size case optimizes with larger stores and alignment; I did not bother with that for the scalable case for now. We can optimize this codepath later if desired. Second, we need a way to test if the shadow is zero. The mechanism for this in the code is to convert the shadow value into a scalar, and then zero check that. 
There's an assumption that this scalar is zero exactly when all elements of the shadow value are zero. As a result, we use an OR reduction on the scalable vector. This is analogous to how e.g. an array is handled. I landed a bunch of cleanup changes to remove other direct uses of the scalar conversion to convince myself there were no other undocumented invariants. Differential Revision: https://reviews.llvm.org/D146157 --- .../Instrumentation/MemorySanitizer.cpp | 20 +- .../MemorySanitizer/vector-load-store.ll | 509 ++++++++++++++++++ 2 files changed, 528 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 75cb9e0c0e63d..953ce72c1cec9 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -1183,13 +1183,29 @@ struct MemorySanitizerVisitor : public InstVisitor { /// Fill memory range with the given origin value. void paintOrigin(IRBuilder<> &IRB, Value *Origin, Value *OriginPtr, TypeSize TS, Align Alignment) { - unsigned Size = TS.getFixedValue(); const DataLayout &DL = F.getParent()->getDataLayout(); const Align IntptrAlignment = DL.getABITypeAlign(MS.IntptrTy); unsigned IntptrSize = DL.getTypeStoreSize(MS.IntptrTy); assert(IntptrAlignment >= kMinOriginAlignment); assert(IntptrSize >= kOriginSize); + // Note: The loop based formation works for fixed length vectors too, + // however we prefer to unroll and specialize alignment below. + if (TS.isScalable()) { + Value *Size = IRB.CreateTypeSize(IRB.getInt32Ty(), TS); + Value *RoundUp = IRB.CreateAdd(Size, IRB.getInt32(kOriginSize - 1)); + Value *End = IRB.CreateUDiv(RoundUp, IRB.getInt32(kOriginSize)); + auto [InsertPt, Index] = + SplitBlockAndInsertSimpleForLoop(End, &*IRB.GetInsertPoint()); + IRB.SetInsertPoint(InsertPt); + + Value *GEP = IRB.CreateGEP(MS.OriginTy, OriginPtr, Index); + IRB.CreateAlignedStore(Origin, GEP, kMinOriginAlignment); + return; + } + + unsigned Size = TS.getFixedValue(); + unsigned Ofs = 0; Align CurrentAlignment = Alignment; if (Alignment >= IntptrAlignment && IntptrSize > kOriginSize) { @@ -1575,6 +1591,8 @@ struct MemorySanitizerVisitor : public InstVisitor { if (ArrayType *Array = dyn_cast(V->getType())) return collapseArrayShadow(Array, V, IRB); if (isa(V->getType())) { + if (isa(V->getType())) + return convertShadowToScalar(IRB.CreateOrReduce(V), IRB); unsigned BitWidth = V->getType()->getPrimitiveSizeInBits().getFixedValue(); return IRB.CreateBitCast(V, IntegerType::get(*MS.C, BitWidth)); diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector-load-store.ll b/llvm/test/Instrumentation/MemorySanitizer/vector-load-store.ll index a2245763abbc7..feb8a27fd5410 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/vector-load-store.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/vector-load-store.ll @@ -422,4 +422,513 @@ define void @store.v16i32(ptr %p) sanitize_memory { ret void } +define void @load.nxv1i32(ptr %p) sanitize_memory { +; CHECK-LABEL: @load.nxv1i32( +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = load , ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load , ptr [[TMP4]], align 4 +; CHECK-NEXT: ret void +; +; ADDR-LABEL: @load.nxv1i32( +; ADDR-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, 
align 8 +; ADDR-NEXT: call void @llvm.donothing() +; ADDR-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; ADDR-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] +; ADDR: 2: +; ADDR-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; ADDR-NEXT: unreachable +; ADDR: 3: +; ADDR-NEXT: [[TMP4:%.*]] = load , ptr [[P:%.*]], align 4 +; ADDR-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64 +; ADDR-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; ADDR-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; ADDR-NEXT: [[_MSLD:%.*]] = load , ptr [[TMP7]], align 4 +; ADDR-NEXT: ret void +; +; ORIGINS-LABEL: @load.nxv1i32( +; ORIGINS-NEXT: call void @llvm.donothing() +; ORIGINS-NEXT: [[TMP1:%.*]] = load , ptr [[P:%.*]], align 4 +; ORIGINS-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64 +; ORIGINS-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080 +; ORIGINS-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +; ORIGINS-NEXT: [[TMP5:%.*]] = add i64 [[TMP3]], 17592186044416 +; ORIGINS-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; ORIGINS-NEXT: [[_MSLD:%.*]] = load , ptr [[TMP4]], align 4 +; ORIGINS-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +; ORIGINS-NEXT: ret void +; + load , ptr %p + ret void +} + +define void @load.nxv2i32(ptr %p) sanitize_memory { +; CHECK-LABEL: @load.nxv2i32( +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = load , ptr [[P:%.*]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load , ptr [[TMP4]], align 8 +; CHECK-NEXT: ret void +; +; ADDR-LABEL: @load.nxv2i32( +; ADDR-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; ADDR-NEXT: call void @llvm.donothing() +; ADDR-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; ADDR-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] +; ADDR: 2: +; ADDR-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; ADDR-NEXT: unreachable +; ADDR: 3: +; ADDR-NEXT: [[TMP4:%.*]] = load , ptr [[P:%.*]], align 8 +; ADDR-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64 +; ADDR-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; ADDR-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; ADDR-NEXT: [[_MSLD:%.*]] = load , ptr [[TMP7]], align 8 +; ADDR-NEXT: ret void +; +; ORIGINS-LABEL: @load.nxv2i32( +; ORIGINS-NEXT: call void @llvm.donothing() +; ORIGINS-NEXT: [[TMP1:%.*]] = load , ptr [[P:%.*]], align 8 +; ORIGINS-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64 +; ORIGINS-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080 +; ORIGINS-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +; ORIGINS-NEXT: [[TMP5:%.*]] = add i64 [[TMP3]], 17592186044416 +; ORIGINS-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; ORIGINS-NEXT: [[_MSLD:%.*]] = load , ptr [[TMP4]], align 8 +; ORIGINS-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 8 +; ORIGINS-NEXT: ret void +; + load , ptr %p + ret void +} + +define void @load.nxv4i32(ptr %p) sanitize_memory { +; CHECK-LABEL: @load.nxv4i32( +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = load , ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load , ptr [[TMP4]], align 16 +; CHECK-NEXT: ret void +; +; ADDR-LABEL: @load.nxv4i32( +; 
ADDR-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; ADDR-NEXT: call void @llvm.donothing() +; ADDR-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; ADDR-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] +; ADDR: 2: +; ADDR-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; ADDR-NEXT: unreachable +; ADDR: 3: +; ADDR-NEXT: [[TMP4:%.*]] = load , ptr [[P:%.*]], align 16 +; ADDR-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64 +; ADDR-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; ADDR-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; ADDR-NEXT: [[_MSLD:%.*]] = load , ptr [[TMP7]], align 16 +; ADDR-NEXT: ret void +; +; ORIGINS-LABEL: @load.nxv4i32( +; ORIGINS-NEXT: call void @llvm.donothing() +; ORIGINS-NEXT: [[TMP1:%.*]] = load , ptr [[P:%.*]], align 16 +; ORIGINS-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64 +; ORIGINS-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080 +; ORIGINS-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +; ORIGINS-NEXT: [[TMP5:%.*]] = add i64 [[TMP3]], 17592186044416 +; ORIGINS-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; ORIGINS-NEXT: [[_MSLD:%.*]] = load , ptr [[TMP4]], align 16 +; ORIGINS-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 16 +; ORIGINS-NEXT: ret void +; + load , ptr %p + ret void +} + +define void @load.nxv8i32(ptr %p) sanitize_memory { +; CHECK-LABEL: @load.nxv8i32( +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = load , ptr [[P:%.*]], align 32 +; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load , ptr [[TMP4]], align 32 +; CHECK-NEXT: ret void +; +; ADDR-LABEL: @load.nxv8i32( +; ADDR-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; ADDR-NEXT: call void @llvm.donothing() +; ADDR-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; ADDR-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] +; ADDR: 2: +; ADDR-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; ADDR-NEXT: unreachable +; ADDR: 3: +; ADDR-NEXT: [[TMP4:%.*]] = load , ptr [[P:%.*]], align 32 +; ADDR-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64 +; ADDR-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; ADDR-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; ADDR-NEXT: [[_MSLD:%.*]] = load , ptr [[TMP7]], align 32 +; ADDR-NEXT: ret void +; +; ORIGINS-LABEL: @load.nxv8i32( +; ORIGINS-NEXT: call void @llvm.donothing() +; ORIGINS-NEXT: [[TMP1:%.*]] = load , ptr [[P:%.*]], align 32 +; ORIGINS-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64 +; ORIGINS-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080 +; ORIGINS-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +; ORIGINS-NEXT: [[TMP5:%.*]] = add i64 [[TMP3]], 17592186044416 +; ORIGINS-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; ORIGINS-NEXT: [[_MSLD:%.*]] = load , ptr [[TMP4]], align 32 +; ORIGINS-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 32 +; ORIGINS-NEXT: ret void +; + load , ptr %p + ret void +} + +define void @load.nxv16i32(ptr %p) sanitize_memory { +; CHECK-LABEL: @load.nxv16i32( +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = load , ptr [[P:%.*]], align 64 +; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load , ptr 
[[TMP4]], align 64 +; CHECK-NEXT: ret void +; +; ADDR-LABEL: @load.nxv16i32( +; ADDR-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; ADDR-NEXT: call void @llvm.donothing() +; ADDR-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; ADDR-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] +; ADDR: 2: +; ADDR-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; ADDR-NEXT: unreachable +; ADDR: 3: +; ADDR-NEXT: [[TMP4:%.*]] = load , ptr [[P:%.*]], align 64 +; ADDR-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64 +; ADDR-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; ADDR-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; ADDR-NEXT: [[_MSLD:%.*]] = load , ptr [[TMP7]], align 64 +; ADDR-NEXT: ret void +; +; ORIGINS-LABEL: @load.nxv16i32( +; ORIGINS-NEXT: call void @llvm.donothing() +; ORIGINS-NEXT: [[TMP1:%.*]] = load , ptr [[P:%.*]], align 64 +; ORIGINS-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64 +; ORIGINS-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080 +; ORIGINS-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +; ORIGINS-NEXT: [[TMP5:%.*]] = add i64 [[TMP3]], 17592186044416 +; ORIGINS-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; ORIGINS-NEXT: [[_MSLD:%.*]] = load , ptr [[TMP4]], align 64 +; ORIGINS-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 64 +; ORIGINS-NEXT: ret void +; + load , ptr %p + ret void +} + + +define void @store.nxv1i32(ptr %p) sanitize_memory { +; CHECK-LABEL: @store.nxv1i32( +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: store zeroinitializer, ptr [[TMP3]], align 4 +; CHECK-NEXT: store zeroinitializer, ptr [[P]], align 4 +; CHECK-NEXT: ret void +; +; ADDR-LABEL: @store.nxv1i32( +; ADDR-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; ADDR-NEXT: call void @llvm.donothing() +; ADDR-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; ADDR-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] +; ADDR: 2: +; ADDR-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; ADDR-NEXT: unreachable +; ADDR: 3: +; ADDR-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; ADDR-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 87960930222080 +; ADDR-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; ADDR-NEXT: store zeroinitializer, ptr [[TMP6]], align 4 +; ADDR-NEXT: store zeroinitializer, ptr [[P]], align 4 +; ADDR-NEXT: ret void +; +; ORIGINS-LABEL: @store.nxv1i32( +; ORIGINS-NEXT: call void @llvm.donothing() +; ORIGINS-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; ORIGINS-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; ORIGINS-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; ORIGINS-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 +; ORIGINS-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; ORIGINS-NEXT: store zeroinitializer, ptr [[TMP3]], align 4 +; ORIGINS-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.nxv1i32( zeroinitializer) +; ORIGINS-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP6]], 0 +; ORIGINS-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP13:%.*]], !prof [[PROF0:![0-9]+]] +; ORIGINS: 7: +; ORIGINS-NEXT: [[TMP8:%.*]] = call i32 @llvm.vscale.i32() +; ORIGINS-NEXT: [[TMP9:%.*]] = mul i32 [[TMP8]], 4 +; ORIGINS-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], 3 +; ORIGINS-NEXT: [[TMP11:%.*]] = udiv i32 [[TMP10]], 4 +; ORIGINS-NEXT: br label 
[[DOTSPLIT:%.*]] +; ORIGINS: .split: +; ORIGINS-NEXT: [[IV:%.*]] = phi i32 [ 0, [[TMP7]] ], [ [[IV_NEXT:%.*]], [[DOTSPLIT]] ] +; ORIGINS-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP5]], i32 [[IV]] +; ORIGINS-NEXT: store i32 0, ptr [[TMP12]], align 4 +; ORIGINS-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; ORIGINS-NEXT: [[IV_CHECK:%.*]] = icmp eq i32 [[IV_NEXT]], [[TMP11]] +; ORIGINS-NEXT: br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]] +; ORIGINS: .split.split: +; ORIGINS-NEXT: br label [[TMP13]] +; ORIGINS: 13: +; ORIGINS-NEXT: store zeroinitializer, ptr [[P]], align 4 +; ORIGINS-NEXT: ret void +; + store zeroinitializer, ptr %p + ret void +} + +define void @store.nxv2i32(ptr %p) sanitize_memory { +; CHECK-LABEL: @store.nxv2i32( +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: store zeroinitializer, ptr [[TMP3]], align 8 +; CHECK-NEXT: store zeroinitializer, ptr [[P]], align 8 +; CHECK-NEXT: ret void +; +; ADDR-LABEL: @store.nxv2i32( +; ADDR-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; ADDR-NEXT: call void @llvm.donothing() +; ADDR-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; ADDR-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] +; ADDR: 2: +; ADDR-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; ADDR-NEXT: unreachable +; ADDR: 3: +; ADDR-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; ADDR-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 87960930222080 +; ADDR-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; ADDR-NEXT: store zeroinitializer, ptr [[TMP6]], align 8 +; ADDR-NEXT: store zeroinitializer, ptr [[P]], align 8 +; ADDR-NEXT: ret void +; +; ORIGINS-LABEL: @store.nxv2i32( +; ORIGINS-NEXT: call void @llvm.donothing() +; ORIGINS-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; ORIGINS-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; ORIGINS-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; ORIGINS-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 +; ORIGINS-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; ORIGINS-NEXT: store zeroinitializer, ptr [[TMP3]], align 8 +; ORIGINS-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.nxv2i32( zeroinitializer) +; ORIGINS-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP6]], 0 +; ORIGINS-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP13:%.*]], !prof [[PROF0]] +; ORIGINS: 7: +; ORIGINS-NEXT: [[TMP8:%.*]] = call i32 @llvm.vscale.i32() +; ORIGINS-NEXT: [[TMP9:%.*]] = mul i32 [[TMP8]], 8 +; ORIGINS-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], 3 +; ORIGINS-NEXT: [[TMP11:%.*]] = udiv i32 [[TMP10]], 4 +; ORIGINS-NEXT: br label [[DOTSPLIT:%.*]] +; ORIGINS: .split: +; ORIGINS-NEXT: [[IV:%.*]] = phi i32 [ 0, [[TMP7]] ], [ [[IV_NEXT:%.*]], [[DOTSPLIT]] ] +; ORIGINS-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP5]], i32 [[IV]] +; ORIGINS-NEXT: store i32 0, ptr [[TMP12]], align 4 +; ORIGINS-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; ORIGINS-NEXT: [[IV_CHECK:%.*]] = icmp eq i32 [[IV_NEXT]], [[TMP11]] +; ORIGINS-NEXT: br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]] +; ORIGINS: .split.split: +; ORIGINS-NEXT: br label [[TMP13]] +; ORIGINS: 13: +; ORIGINS-NEXT: store zeroinitializer, ptr [[P]], align 8 +; ORIGINS-NEXT: ret void +; + store zeroinitializer, ptr %p + ret void +} + +define void @store.nxv4i32(ptr %p) sanitize_memory { +; 
CHECK-LABEL: @store.nxv4i32( +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: store zeroinitializer, ptr [[TMP3]], align 16 +; CHECK-NEXT: store zeroinitializer, ptr [[P]], align 16 +; CHECK-NEXT: ret void +; +; ADDR-LABEL: @store.nxv4i32( +; ADDR-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; ADDR-NEXT: call void @llvm.donothing() +; ADDR-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; ADDR-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] +; ADDR: 2: +; ADDR-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; ADDR-NEXT: unreachable +; ADDR: 3: +; ADDR-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; ADDR-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 87960930222080 +; ADDR-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; ADDR-NEXT: store zeroinitializer, ptr [[TMP6]], align 16 +; ADDR-NEXT: store zeroinitializer, ptr [[P]], align 16 +; ADDR-NEXT: ret void +; +; ORIGINS-LABEL: @store.nxv4i32( +; ORIGINS-NEXT: call void @llvm.donothing() +; ORIGINS-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; ORIGINS-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; ORIGINS-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; ORIGINS-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 +; ORIGINS-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; ORIGINS-NEXT: store zeroinitializer, ptr [[TMP3]], align 16 +; ORIGINS-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.nxv4i32( zeroinitializer) +; ORIGINS-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP6]], 0 +; ORIGINS-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP13:%.*]], !prof [[PROF0]] +; ORIGINS: 7: +; ORIGINS-NEXT: [[TMP8:%.*]] = call i32 @llvm.vscale.i32() +; ORIGINS-NEXT: [[TMP9:%.*]] = mul i32 [[TMP8]], 16 +; ORIGINS-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], 3 +; ORIGINS-NEXT: [[TMP11:%.*]] = udiv i32 [[TMP10]], 4 +; ORIGINS-NEXT: br label [[DOTSPLIT:%.*]] +; ORIGINS: .split: +; ORIGINS-NEXT: [[IV:%.*]] = phi i32 [ 0, [[TMP7]] ], [ [[IV_NEXT:%.*]], [[DOTSPLIT]] ] +; ORIGINS-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP5]], i32 [[IV]] +; ORIGINS-NEXT: store i32 0, ptr [[TMP12]], align 4 +; ORIGINS-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; ORIGINS-NEXT: [[IV_CHECK:%.*]] = icmp eq i32 [[IV_NEXT]], [[TMP11]] +; ORIGINS-NEXT: br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]] +; ORIGINS: .split.split: +; ORIGINS-NEXT: br label [[TMP13]] +; ORIGINS: 13: +; ORIGINS-NEXT: store zeroinitializer, ptr [[P]], align 16 +; ORIGINS-NEXT: ret void +; + store zeroinitializer, ptr %p + ret void +} + +define void @store.nxv8i32(ptr %p) sanitize_memory { +; CHECK-LABEL: @store.nxv8i32( +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: store zeroinitializer, ptr [[TMP3]], align 32 +; CHECK-NEXT: store zeroinitializer, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; +; ADDR-LABEL: @store.nxv8i32( +; ADDR-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; ADDR-NEXT: call void @llvm.donothing() +; ADDR-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; ADDR-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] +; ADDR: 2: +; ADDR-NEXT: call void 
@__msan_warning_noreturn() #[[ATTR3]] +; ADDR-NEXT: unreachable +; ADDR: 3: +; ADDR-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; ADDR-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 87960930222080 +; ADDR-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; ADDR-NEXT: store zeroinitializer, ptr [[TMP6]], align 32 +; ADDR-NEXT: store zeroinitializer, ptr [[P]], align 32 +; ADDR-NEXT: ret void +; +; ORIGINS-LABEL: @store.nxv8i32( +; ORIGINS-NEXT: call void @llvm.donothing() +; ORIGINS-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; ORIGINS-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; ORIGINS-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; ORIGINS-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 +; ORIGINS-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; ORIGINS-NEXT: store zeroinitializer, ptr [[TMP3]], align 32 +; ORIGINS-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.nxv8i32( zeroinitializer) +; ORIGINS-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP6]], 0 +; ORIGINS-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP13:%.*]], !prof [[PROF0]] +; ORIGINS: 7: +; ORIGINS-NEXT: [[TMP8:%.*]] = call i32 @llvm.vscale.i32() +; ORIGINS-NEXT: [[TMP9:%.*]] = mul i32 [[TMP8]], 32 +; ORIGINS-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], 3 +; ORIGINS-NEXT: [[TMP11:%.*]] = udiv i32 [[TMP10]], 4 +; ORIGINS-NEXT: br label [[DOTSPLIT:%.*]] +; ORIGINS: .split: +; ORIGINS-NEXT: [[IV:%.*]] = phi i32 [ 0, [[TMP7]] ], [ [[IV_NEXT:%.*]], [[DOTSPLIT]] ] +; ORIGINS-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP5]], i32 [[IV]] +; ORIGINS-NEXT: store i32 0, ptr [[TMP12]], align 4 +; ORIGINS-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; ORIGINS-NEXT: [[IV_CHECK:%.*]] = icmp eq i32 [[IV_NEXT]], [[TMP11]] +; ORIGINS-NEXT: br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]] +; ORIGINS: .split.split: +; ORIGINS-NEXT: br label [[TMP13]] +; ORIGINS: 13: +; ORIGINS-NEXT: store zeroinitializer, ptr [[P]], align 32 +; ORIGINS-NEXT: ret void +; + store zeroinitializer, ptr %p + ret void +} +define void @store.nxv16i32(ptr %p) sanitize_memory { +; CHECK-LABEL: @store.nxv16i32( +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: store zeroinitializer, ptr [[TMP3]], align 64 +; CHECK-NEXT: store zeroinitializer, ptr [[P]], align 64 +; CHECK-NEXT: ret void +; +; ADDR-LABEL: @store.nxv16i32( +; ADDR-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; ADDR-NEXT: call void @llvm.donothing() +; ADDR-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; ADDR-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] +; ADDR: 2: +; ADDR-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; ADDR-NEXT: unreachable +; ADDR: 3: +; ADDR-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; ADDR-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 87960930222080 +; ADDR-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; ADDR-NEXT: store zeroinitializer, ptr [[TMP6]], align 64 +; ADDR-NEXT: store zeroinitializer, ptr [[P]], align 64 +; ADDR-NEXT: ret void +; +; ORIGINS-LABEL: @store.nxv16i32( +; ORIGINS-NEXT: call void @llvm.donothing() +; ORIGINS-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; ORIGINS-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; ORIGINS-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; ORIGINS-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 +; 
ORIGINS-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; ORIGINS-NEXT: store zeroinitializer, ptr [[TMP3]], align 64 +; ORIGINS-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.nxv16i32( zeroinitializer) +; ORIGINS-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP6]], 0 +; ORIGINS-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP13:%.*]], !prof [[PROF0]] +; ORIGINS: 7: +; ORIGINS-NEXT: [[TMP8:%.*]] = call i32 @llvm.vscale.i32() +; ORIGINS-NEXT: [[TMP9:%.*]] = mul i32 [[TMP8]], 64 +; ORIGINS-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], 3 +; ORIGINS-NEXT: [[TMP11:%.*]] = udiv i32 [[TMP10]], 4 +; ORIGINS-NEXT: br label [[DOTSPLIT:%.*]] +; ORIGINS: .split: +; ORIGINS-NEXT: [[IV:%.*]] = phi i32 [ 0, [[TMP7]] ], [ [[IV_NEXT:%.*]], [[DOTSPLIT]] ] +; ORIGINS-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP5]], i32 [[IV]] +; ORIGINS-NEXT: store i32 0, ptr [[TMP12]], align 4 +; ORIGINS-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; ORIGINS-NEXT: [[IV_CHECK:%.*]] = icmp eq i32 [[IV_NEXT]], [[TMP11]] +; ORIGINS-NEXT: br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]] +; ORIGINS: .split.split: +; ORIGINS-NEXT: br label [[TMP13]] +; ORIGINS: 13: +; ORIGINS-NEXT: store zeroinitializer, ptr [[P]], align 64 +; ORIGINS-NEXT: ret void +; + store zeroinitializer, ptr %p + ret void +} From 82c83d7e41053b72fc0dc84de9b8bee71986ffc3 Mon Sep 17 00:00:00 2001 From: Corentin Jabot Date: Tue, 21 Mar 2023 16:57:43 +0100 Subject: [PATCH 132/208] [Clang] Fix evaluation of parameters of lambda call operator attributes Fix a regresion introduced by D124351. Attributes of lambda call operator were evaluated in the context of the closure object type rather than its operator, causing an assertion failure. This was because we temporarily switch to the class lambda to produce the mangling of the lambda, but we stayed in that context too long. Reviewed By: eandrews, aaron.ballman Differential Revision: https://reviews.llvm.org/D146535 --- clang/lib/Sema/SemaLambda.cpp | 7 +++---- clang/test/SemaCXX/lambda-expressions.cpp | 12 +++++++++++- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/clang/lib/Sema/SemaLambda.cpp b/clang/lib/Sema/SemaLambda.cpp index 3a82c7b3e8285..64db9d065f9c6 100644 --- a/clang/lib/Sema/SemaLambda.cpp +++ b/clang/lib/Sema/SemaLambda.cpp @@ -390,6 +390,9 @@ buildTypeForLambdaCallOperator(Sema &S, clang::CXXRecordDecl *Class, void Sema::handleLambdaNumbering( CXXRecordDecl *Class, CXXMethodDecl *Method, std::optional> Mangling) { + + ContextRAII ManglingContext(*this, Class->getDeclContext()); + if (Mangling) { bool HasKnownInternalLinkage; unsigned ManglingNumber, DeviceManglingNumber; @@ -1324,8 +1327,6 @@ void Sema::ActOnStartOfLambdaDefinition(LambdaIntroducer &Intro, ParamInfo.getDeclSpec().getConstexprSpecifier(), IsLambdaStatic ? 
SC_Static : SC_None, Params, ExplicitResultType); - ContextRAII ManglingContext(*this, Class->getDeclContext()); - CheckCXXDefaultArguments(Method); // This represents the function body for the lambda function, check if we @@ -1350,8 +1351,6 @@ void Sema::ActOnStartOfLambdaDefinition(LambdaIntroducer &Intro, handleLambdaNumbering(Class, Method); - ManglingContext.pop(); - for (auto &&C : LSI->Captures) { if (!C.isVariableCapture()) continue; diff --git a/clang/test/SemaCXX/lambda-expressions.cpp b/clang/test/SemaCXX/lambda-expressions.cpp index 84d224fdc835e..67853c991ce53 100644 --- a/clang/test/SemaCXX/lambda-expressions.cpp +++ b/clang/test/SemaCXX/lambda-expressions.cpp @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -std=c++14 -Wno-unused-value -fsyntax-only -verify -verify=expected-cxx14 -fblocks %s -// RUN: %clang_cc1 -std=c++17 -Wno-unused-value -fsyntax-only -verify -fblocks %s +// RUN: %clang_cc1 -std=c++17 -Wno-unused-value -verify -ast-dump -fblocks %s | FileCheck %s namespace std { class type_info; }; @@ -704,3 +704,13 @@ static_assert([]() constexpr { }()); } // namespace GH60936 #endif + +// Call operator attributes refering to a variable should +// be properly handled after D124351 +constexpr int i = 2; +void foo() { + (void)[=][[gnu::aligned(i)]] () {}; // expected-warning{{C++2b extension}} + // CHECK: AlignedAttr + // CHECK-NEXT: ConstantExpr + // CHECK-NEXT: value: Int 2 +} From 61944469625d4b3ba4a87f4f8fffefb73e9f8cdc Mon Sep 17 00:00:00 2001 From: Archibald Elliott Date: Wed, 22 Mar 2023 13:25:08 +0000 Subject: [PATCH 133/208] [AArch64] Add Missing Custom Target Operands I noticed, when examining the generated Asm Matcher table, that some of these custom immediate operands are missing, and so we are not parsing some hint aliases into the correct MCInst. Where this becomes apparent is when you parse e.g. `hint #7` into an MCInst - without these cases, it becomes the MCInst `(HINT 17)`, which will always be printed as `hint #17`. With these cases, it becomes the MCInst `XPACLRI`, which will be printed as `xpaclri` with pauth, or `hint #17` without, matching how `xpaclri` is parsed. We only handle some specific hint aliases in this manner, usually where these hints have specific effects that need to be modelled for accurate code-generation. Otherwise, we just use the normal `InstAlias` system to have the aliases parsed into a `(HINT N)` MCInst. Differential Revision: https://reviews.llvm.org/D146630 --- .../AArch64/AsmParser/AArch64AsmParser.cpp | 17 ++++++-- .../test/MC/AArch64/armv8.3a-signed-pointer.s | 39 +++++++++++++++++++ 2 files changed, 53 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index d5162d39ce43c..b0c554780edfd 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -7642,9 +7642,10 @@ unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, return Match_Success; return Match_InvalidOperand; - // If the kind is a token for a literal immediate, check if our asm - // operand matches. This is for InstAliases which have a fixed-value - // immediate in the syntax. + // If the kind is a token for a literal immediate, check if our asm operand + // matches. This is for InstAliases which have a fixed-value immediate in + // the asm string, such as hints which are parsed into a specific + // instruction definition. 
#define MATCH_HASH(N) \ case MCK__HASH_##N: \ return MatchesOpImmediate(N); @@ -7654,10 +7655,20 @@ unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, MATCH_HASH(3) MATCH_HASH(4) MATCH_HASH(6) + MATCH_HASH(7) MATCH_HASH(8) + MATCH_HASH(10) MATCH_HASH(12) + MATCH_HASH(14) MATCH_HASH(16) MATCH_HASH(24) + MATCH_HASH(25) + MATCH_HASH(26) + MATCH_HASH(27) + MATCH_HASH(28) + MATCH_HASH(29) + MATCH_HASH(30) + MATCH_HASH(31) MATCH_HASH(32) MATCH_HASH(40) MATCH_HASH(48) diff --git a/llvm/test/MC/AArch64/armv8.3a-signed-pointer.s b/llvm/test/MC/AArch64/armv8.3a-signed-pointer.s index dad4bc667853b..e13b1bf3c98d8 100644 --- a/llvm/test/MC/AArch64/armv8.3a-signed-pointer.s +++ b/llvm/test/MC/AArch64/armv8.3a-signed-pointer.s @@ -96,44 +96,83 @@ // ALL-EMPTY: // ALL-EMPTY: + hint #25 paciasp // CHECK-NEXT: paciasp // encoding: [0x3f,0x23,0x03,0xd5] +// CHECK-NEXT: paciasp // encoding: [0x3f,0x23,0x03,0xd5] +// NO83-NEXT: hint #25 // encoding: [0x3f,0x23,0x03,0xd5] // NO83-NEXT: hint #25 // encoding: [0x3f,0x23,0x03,0xd5] + hint #29 autiasp // CHECK-NEXT: autiasp // encoding: [0xbf,0x23,0x03,0xd5] +// CHECK-NEXT: autiasp // encoding: [0xbf,0x23,0x03,0xd5] +// NO83-NEXT: hint #29 // encoding: [0xbf,0x23,0x03,0xd5] // NO83-NEXT: hint #29 // encoding: [0xbf,0x23,0x03,0xd5] + hint #24 paciaz // CHECK-NEXT: paciaz // encoding: [0x1f,0x23,0x03,0xd5] +// CHECK-NEXT: paciaz // encoding: [0x1f,0x23,0x03,0xd5] +// NO83-NEXT: hint #24 // encoding: [0x1f,0x23,0x03,0xd5] // NO83-NEXT: hint #24 // encoding: [0x1f,0x23,0x03,0xd5] + hint #28 autiaz // CHECK-NEXT: autiaz // encoding: [0x9f,0x23,0x03,0xd5] +// CHECK-NEXT: autiaz // encoding: [0x9f,0x23,0x03,0xd5] // NO83-NEXT: hint #28 // encoding: [0x9f,0x23,0x03,0xd5] +// NO83-NEXT: hint #28 // encoding: [0x9f,0x23,0x03,0xd5] + hint #8 pacia1716 // CHECK-NEXT: pacia1716 // encoding: [0x1f,0x21,0x03,0xd5] +// CHECK-NEXT: pacia1716 // encoding: [0x1f,0x21,0x03,0xd5] +// NO83-NEXT: hint #8 // encoding: [0x1f,0x21,0x03,0xd5] // NO83-NEXT: hint #8 // encoding: [0x1f,0x21,0x03,0xd5] + hint #12 autia1716 // CHECK-NEXT: autia1716 // encoding: [0x9f,0x21,0x03,0xd5] +// CHECK-NEXT: autia1716 // encoding: [0x9f,0x21,0x03,0xd5] +// NO83-NEXT: hint #12 // encoding: [0x9f,0x21,0x03,0xd5] // NO83-NEXT: hint #12 // encoding: [0x9f,0x21,0x03,0xd5] + hint #27 pacibsp // CHECK-NEXT: pacibsp // encoding: [0x7f,0x23,0x03,0xd5] +// CHECK-NEXT: pacibsp // encoding: [0x7f,0x23,0x03,0xd5] // NO83-NEXT: hint #27 // encoding: [0x7f,0x23,0x03,0xd5] +// NO83-NEXT: hint #27 // encoding: [0x7f,0x23,0x03,0xd5] + hint #31 autibsp // CHECK-NEXT: autibsp // encoding: [0xff,0x23,0x03,0xd5] +// CHECK-NEXT: autibsp // encoding: [0xff,0x23,0x03,0xd5] // NO83-NEXT: hint #31 // encoding: [0xff,0x23,0x03,0xd5] +// NO83-NEXT: hint #31 // encoding: [0xff,0x23,0x03,0xd5] + hint #26 pacibz // CHECK-NEXT: pacibz // encoding: [0x5f,0x23,0x03,0xd5] +// CHECK-NEXT: pacibz // encoding: [0x5f,0x23,0x03,0xd5] // NO83-NEXT: hint #26 // encoding: [0x5f,0x23,0x03,0xd5] +// NO83-NEXT: hint #26 // encoding: [0x5f,0x23,0x03,0xd5] + hint #30 autibz // CHECK-NEXT: autibz // encoding: [0xdf,0x23,0x03,0xd5] +// CHECK-NEXT: autibz // encoding: [0xdf,0x23,0x03,0xd5] +// NO83-NEXT: hint #30 // encoding: [0xdf,0x23,0x03,0xd5] // NO83-NEXT: hint #30 // encoding: [0xdf,0x23,0x03,0xd5] + hint #10 pacib1716 // CHECK-NEXT: pacib1716 // encoding: [0x5f,0x21,0x03,0xd5] +// CHECK-NEXT: pacib1716 // encoding: [0x5f,0x21,0x03,0xd5] +// NO83-NEXT: hint #10 // encoding: [0x5f,0x21,0x03,0xd5] // NO83-NEXT: hint #10 // 
encoding: [0x5f,0x21,0x03,0xd5] + hint #14 autib1716 // CHECK-NEXT: autib1716 // encoding: [0xdf,0x21,0x03,0xd5] +// CHECK-NEXT: autib1716 // encoding: [0xdf,0x21,0x03,0xd5] // NO83-NEXT: hint #14 // encoding: [0xdf,0x21,0x03,0xd5] +// NO83-NEXT: hint #14 // encoding: [0xdf,0x21,0x03,0xd5] + hint #7 xpaclri // CHECK-NEXT: xpaclri // encoding: [0xff,0x20,0x03,0xd5] +// CHECK-NEXT: xpaclri // encoding: [0xff,0x20,0x03,0xd5] +// NO83-NEXT: hint #7 // encoding: [0xff,0x20,0x03,0xd5] // NO83-NEXT: hint #7 // encoding: [0xff,0x20,0x03,0xd5] // ALL-EMPTY: From f570bd8f6322fab18df5099786683a813f9e7a08 Mon Sep 17 00:00:00 2001 From: Kirill Stoimenov Date: Wed, 22 Mar 2023 18:09:00 +0000 Subject: [PATCH 134/208] [HWASAN] Disable unexpected_format_specifier_test because HWASAN doesn't provide a printf interceptor Reviewed By: vitalybuka Differential Revision: https://reviews.llvm.org/D146647 --- compiler-rt/test/sanitizer_common/TestCases/Linux/mprobe.cpp | 2 +- .../TestCases/Linux/unexpected_format_specifier_test.cpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/mprobe.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/mprobe.cpp index 82c0faf0e2add..7633eb4762292 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Linux/mprobe.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/mprobe.cpp @@ -1,5 +1,5 @@ // RUN: %clangxx %s -o %t && %run %t 2>&1 | FileCheck %s -// UNSUPPORTED: android, ubsan +// UNSUPPORTED: android, hwasan, ubsan #include #include diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/unexpected_format_specifier_test.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/unexpected_format_specifier_test.cpp index 641495508ba10..fdce916ad1e1a 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Linux/unexpected_format_specifier_test.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/unexpected_format_specifier_test.cpp @@ -1,4 +1,5 @@ // RUN: %clang -w -O0 %s -o %t && %run %t 2>&1 | FileCheck %s +// UNSUPPORTED: hwasan // UNSUPPORTED: lsan // UNSUPPORTED: msan // UNSUPPORTED: ubsan From 5eb9acf9be3cee01ea95448fa8b1e00e3c01868a Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 23 Mar 2023 08:09:32 -0700 Subject: [PATCH 135/208] [HWASAN] Instrument scalable load/store without crashing We can simply push them down the existing call slowpath with some minor changes to how we compute the size argument. 
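Concretely, the change to the size argument is to materialize it with IRBuilder instead of using a host-side constant, so the same sized-callback path covers scalable accesses. A rough sketch of that computation, assuming an IRBuilder positioned at the instrumented access (the helper name and signature are illustrative, not taken from the patch):

    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // Returns the access size in bytes as an IR Value. For a fixed-size type
    // CreateTypeSize folds to a plain constant; for a scalable vector it
    // expands to a vscale * known-minimum-size computation at run time.
    static Value *emitAccessSizeInBytes(IRBuilderBase &IRB, Type *IntptrTy,
                                        TypeSize StoreSizeInBits) {
      Value *SizeInBits = IRB.CreateTypeSize(IntptrTy, StoreSizeInBits);
      return IRB.CreateUDiv(SizeInBits, ConstantInt::get(IntptrTy, 8));
    }

That value feeds the existing __hwasan_loadN/__hwasan_storeN callbacks, which is why the scalable cases in the new test check for a vscale/mul/udiv sequence ahead of the call.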
--- .../Instrumentation/HWAddressSanitizer.cpp | 6 +- .../HWAddressSanitizer/vector-load-store.ll | 272 ++++++++++++++++++ 2 files changed, 276 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Instrumentation/HWAddressSanitizer/vector-load-store.ll diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index ca498d08422f7..f98cb67481154 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -964,7 +964,7 @@ bool HWAddressSanitizer::instrumentMemAccess(InterestingMemoryOperand &O) { return false; // FIXME IRBuilder<> IRB(O.getInsn()); - if (isPowerOf2_64(O.TypeStoreSize) && + if (!O.TypeStoreSize.isScalable() && isPowerOf2_64(O.TypeStoreSize) && (O.TypeStoreSize / 8 <= (1ULL << (kNumberOfAccessSizes - 1))) && (!O.Alignment || *O.Alignment >= Mapping.getObjectAlignment() || *O.Alignment >= O.TypeStoreSize / 8)) { @@ -980,7 +980,9 @@ bool HWAddressSanitizer::instrumentMemAccess(InterestingMemoryOperand &O) { } else { IRB.CreateCall(HwasanMemoryAccessCallbackSized[O.IsWrite], {IRB.CreatePointerCast(Addr, IntptrTy), - ConstantInt::get(IntptrTy, O.TypeStoreSize / 8)}); + IRB.CreateUDiv(IRB.CreateTypeSize(IntptrTy, + O.TypeStoreSize), + ConstantInt::get(IntptrTy, 8))}); } untagPointerOperand(O.getInsn(), Addr); diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/vector-load-store.ll b/llvm/test/Instrumentation/HWAddressSanitizer/vector-load-store.ll new file mode 100644 index 0000000000000..5312c7cc7336d --- /dev/null +++ b/llvm/test/Instrumentation/HWAddressSanitizer/vector-load-store.ll @@ -0,0 +1,272 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=hwasan -S | FileCheck %s + +target triple = "aarch64--linux-android10000" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @load.v1i32(ptr %p) sanitize_hwaddress { +; CHECK-LABEL: @load.v1i32( +; CHECK-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) +; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[P:%.*]], i32 2) +; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P]], align 4 +; CHECK-NEXT: ret void +; + load <1 x i32>, ptr %p + ret void +} + +define void @load.v2i32(ptr %p) sanitize_hwaddress { +; CHECK-LABEL: @load.v2i32( +; CHECK-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) +; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[P:%.*]], i32 3) +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + load <2 x i32>, ptr %p + ret void +} + +define void @load.v4i32(ptr %p) sanitize_hwaddress { +; CHECK-LABEL: @load.v4i32( +; CHECK-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) +; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[P:%.*]], i32 4) +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P]], align 16 +; CHECK-NEXT: ret void +; + load <4 x i32>, ptr %p + ret void +} + +define void @load.v8i32(ptr %p) sanitize_hwaddress { +; CHECK-LABEL: @load.v8i32( +; CHECK-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; CHECK-NEXT: call void @__hwasan_loadN(i64 [[TMP1]], i64 32) +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr [[P]], 
align 32 +; CHECK-NEXT: ret void +; + load <8 x i32>, ptr %p + ret void +} + +define void @load.v16i32(ptr %p) sanitize_hwaddress { +; CHECK-LABEL: @load.v16i32( +; CHECK-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; CHECK-NEXT: call void @__hwasan_loadN(i64 [[TMP1]], i64 64) +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr [[P]], align 64 +; CHECK-NEXT: ret void +; + load <16 x i32>, ptr %p + ret void +} + + +define void @store.v1i32(ptr %p) sanitize_hwaddress { +; CHECK-LABEL: @store.v1i32( +; CHECK-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) +; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[P:%.*]], i32 18) +; CHECK-NEXT: store <1 x i32> zeroinitializer, ptr [[P]], align 4 +; CHECK-NEXT: ret void +; + store <1 x i32> zeroinitializer, ptr %p + ret void +} + +define void @store.v2i32(ptr %p) sanitize_hwaddress { +; CHECK-LABEL: @store.v2i32( +; CHECK-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) +; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[P:%.*]], i32 19) +; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + store <2 x i32> zeroinitializer, ptr %p + ret void +} + +define void @store.v4i32(ptr %p) sanitize_hwaddress { +; CHECK-LABEL: @store.v4i32( +; CHECK-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) +; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[P:%.*]], i32 20) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[P]], align 16 +; CHECK-NEXT: ret void +; + store <4 x i32> zeroinitializer, ptr %p + ret void +} + +define void @store.v8i32(ptr %p) sanitize_hwaddress { +; CHECK-LABEL: @store.v8i32( +; CHECK-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; CHECK-NEXT: call void @__hwasan_storeN(i64 [[TMP1]], i64 32) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + store <8 x i32> zeroinitializer, ptr %p + ret void +} + +define void @store.v16i32(ptr %p) sanitize_hwaddress { +; CHECK-LABEL: @store.v16i32( +; CHECK-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; CHECK-NEXT: call void @__hwasan_storeN(i64 [[TMP1]], i64 64) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr [[P]], align 64 +; CHECK-NEXT: ret void +; + store <16 x i32> zeroinitializer, ptr %p + ret void +} + + +define void @load.nxv1i32(ptr %p) sanitize_hwaddress { +; CHECK-LABEL: @load.nxv1i32( +; CHECK-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 +; CHECK-NEXT: [[TMP4:%.*]] = udiv i64 [[TMP3]], 8 +; CHECK-NEXT: call void @__hwasan_loadN(i64 [[TMP1]], i64 [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = load , ptr [[P]], align 4 +; CHECK-NEXT: ret void +; + load , ptr %p + ret void +} + +define void @load.nxv2i32(ptr %p) sanitize_hwaddress { +; CHECK-LABEL: @load.nxv2i32( +; CHECK-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to 
i64 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 64 +; CHECK-NEXT: [[TMP4:%.*]] = udiv i64 [[TMP3]], 8 +; CHECK-NEXT: call void @__hwasan_loadN(i64 [[TMP1]], i64 [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = load , ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + load , ptr %p + ret void +} + +define void @load.nxv4i32(ptr %p) sanitize_hwaddress { +; CHECK-LABEL: @load.nxv4i32( +; CHECK-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 128 +; CHECK-NEXT: [[TMP4:%.*]] = udiv i64 [[TMP3]], 8 +; CHECK-NEXT: call void @__hwasan_loadN(i64 [[TMP1]], i64 [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = load , ptr [[P]], align 16 +; CHECK-NEXT: ret void +; + load , ptr %p + ret void +} + +define void @load.nxv8i32(ptr %p) sanitize_hwaddress { +; CHECK-LABEL: @load.nxv8i32( +; CHECK-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 256 +; CHECK-NEXT: [[TMP4:%.*]] = udiv i64 [[TMP3]], 8 +; CHECK-NEXT: call void @__hwasan_loadN(i64 [[TMP1]], i64 [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = load , ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + load , ptr %p + ret void +} + +define void @load.nxv16i32(ptr %p) sanitize_hwaddress { +; CHECK-LABEL: @load.nxv16i32( +; CHECK-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 512 +; CHECK-NEXT: [[TMP4:%.*]] = udiv i64 [[TMP3]], 8 +; CHECK-NEXT: call void @__hwasan_loadN(i64 [[TMP1]], i64 [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = load , ptr [[P]], align 64 +; CHECK-NEXT: ret void +; + load , ptr %p + ret void +} + + +define void @store.nxv1i32(ptr %p) sanitize_hwaddress { +; CHECK-LABEL: @store.nxv1i32( +; CHECK-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 +; CHECK-NEXT: [[TMP4:%.*]] = udiv i64 [[TMP3]], 8 +; CHECK-NEXT: call void @__hwasan_storeN(i64 [[TMP1]], i64 [[TMP4]]) +; CHECK-NEXT: store zeroinitializer, ptr [[P]], align 4 +; CHECK-NEXT: ret void +; + store zeroinitializer, ptr %p + ret void +} + +define void @store.nxv2i32(ptr %p) sanitize_hwaddress { +; CHECK-LABEL: @store.nxv2i32( +; CHECK-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 64 +; CHECK-NEXT: [[TMP4:%.*]] = udiv i64 [[TMP3]], 8 +; CHECK-NEXT: call void @__hwasan_storeN(i64 [[TMP1]], i64 [[TMP4]]) +; CHECK-NEXT: store zeroinitializer, ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + store zeroinitializer, ptr %p + ret void +} + +define void @store.nxv4i32(ptr %p) sanitize_hwaddress { +; CHECK-LABEL: @store.nxv4i32( +; CHECK-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; CHECK-NEXT: 
[[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 128 +; CHECK-NEXT: [[TMP4:%.*]] = udiv i64 [[TMP3]], 8 +; CHECK-NEXT: call void @__hwasan_storeN(i64 [[TMP1]], i64 [[TMP4]]) +; CHECK-NEXT: store zeroinitializer, ptr [[P]], align 16 +; CHECK-NEXT: ret void +; + store zeroinitializer, ptr %p + ret void +} + +define void @store.nxv8i32(ptr %p) sanitize_hwaddress { +; CHECK-LABEL: @store.nxv8i32( +; CHECK-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 256 +; CHECK-NEXT: [[TMP4:%.*]] = udiv i64 [[TMP3]], 8 +; CHECK-NEXT: call void @__hwasan_storeN(i64 [[TMP1]], i64 [[TMP4]]) +; CHECK-NEXT: store zeroinitializer, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + store zeroinitializer, ptr %p + ret void +} + +define void @store.nxv16i32(ptr %p) sanitize_hwaddress { +; CHECK-LABEL: @store.nxv16i32( +; CHECK-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 512 +; CHECK-NEXT: [[TMP4:%.*]] = udiv i64 [[TMP3]], 8 +; CHECK-NEXT: call void @__hwasan_storeN(i64 [[TMP1]], i64 [[TMP4]]) +; CHECK-NEXT: store zeroinitializer, ptr [[P]], align 64 +; CHECK-NEXT: ret void +; + store zeroinitializer, ptr %p + ret void +} From fd1850b36158eaee5a2d577adc5872ab58362669 Mon Sep 17 00:00:00 2001 From: Job Noorman Date: Thu, 23 Mar 2023 16:25:53 +0100 Subject: [PATCH 136/208] Test commit to see if write access works From 85faee69928b1eeb74a0d74f374a1c74ddf236dd Mon Sep 17 00:00:00 2001 From: Jan Sjodin Date: Tue, 21 Mar 2023 13:38:54 -0400 Subject: [PATCH 137/208] [OpenMP][OMPIRBuilder] Make OffloadEntriesInfoManager a member of OpenMPIRBuilder This patch adds the OffloadEntriesInfoManager to the OpenMPIRBuilder, and allows the OffloadEntriesInfoManager to access the Configuration in the OpenMPIRBuilder. With the shared Config there is no risk for inconsistencies, and there is no longer the need for clang to have a separate OffloadEntriesInfoManager. 
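A minimal sketch of the resulting usage from a frontend's side, assuming a Config value built the way the clang changes below build it (the helper function and its parameter list are illustrative, not part of the patch; only the member access and the calls mirror the change):

    #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
    using namespace llvm;

    // Registers one target region entry through the builder-owned manager.
    // A single setConfig call now covers both the builder and the entry-info
    // manager, so a frontend no longer keeps and configures its own
    // OffloadEntriesInfoManager instance.
    static void registerEntry(OpenMPIRBuilder &OMPBuilder,
                              OpenMPIRBuilderConfig Config,
                              const TargetRegionEntryInfo &EntryInfo,
                              Constant *Addr, Constant *ID) {
      OMPBuilder.setConfig(Config);
      OMPBuilder.OffloadInfoManager.registerTargetRegionEntryInfo(
          EntryInfo, Addr, ID,
          OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
    }

The clang side of the diff is then largely mechanical: the CGOpenMPRuntime member and its separate setConfig call go away, and every former use is reached through OMPBuilder.OffloadInfoManager instead.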
Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D146549 --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 39 +- clang/lib/CodeGen/CGOpenMPRuntime.h | 3 - clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 1 - .../llvm/Frontend/OpenMP/OMPIRBuilder.h | 511 +++++++++--------- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 38 +- .../Frontend/OpenMPIRBuilderTest.cpp | 5 +- 6 files changed, 291 insertions(+), 306 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 58a95d64ac50e..76d0b92796bc5 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -1054,7 +1054,7 @@ static FieldDecl *addFieldToRecordDecl(ASTContext &C, DeclContext *DC, } CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM) - : CGM(CGM), OMPBuilder(CGM.getModule()), OffloadEntriesInfoManager() { + : CGM(CGM), OMPBuilder(CGM.getModule()) { KmpCriticalNameTy = llvm::ArrayType::get(CGM.Int32Ty, /*NumElements*/ 8); llvm::OpenMPIRBuilderConfig Config(CGM.getLangOpts().OpenMPIsDevice, false, hasRequiresUnifiedSharedMemory(), @@ -1062,7 +1062,6 @@ CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM) // Initialize Types used in OpenMPIRBuilder from OMPKinds.def OMPBuilder.initialize(); OMPBuilder.setConfig(Config); - OffloadEntriesInfoManager.setConfig(Config); loadOffloadInfoMetadata(); } @@ -1852,7 +1851,7 @@ bool CGOpenMPRuntime::emitDeclareTargetVarDefinition(const VarDecl *VD, auto EntryInfo = getTargetEntryUniqueInfo(CGM.getContext(), Loc, VD->getName()); SmallString<128> Buffer, Out; - OffloadEntriesInfoManager.getTargetRegionEntryFnName(Buffer, EntryInfo); + OMPBuilder.OffloadInfoManager.getTargetRegionEntryFnName(Buffer, EntryInfo); const Expr *Init = VD->getAnyInitializer(); if (CGM.getLangOpts().CPlusPlus && PerformInit) { @@ -1900,7 +1899,7 @@ bool CGOpenMPRuntime::emitDeclareTargetVarDefinition(const VarDecl *VD, Out.clear(); auto CtorEntryInfo = EntryInfo; CtorEntryInfo.ParentName = Twine(Buffer, "_ctor").toStringRef(Out); - OffloadEntriesInfoManager.registerTargetRegionEntryInfo( + OMPBuilder.OffloadInfoManager.registerTargetRegionEntryInfo( CtorEntryInfo, Ctor, ID, llvm::OffloadEntriesInfoManager::OMPTargetRegionEntryCtor); } @@ -1949,7 +1948,7 @@ bool CGOpenMPRuntime::emitDeclareTargetVarDefinition(const VarDecl *VD, Out.clear(); auto DtorEntryInfo = EntryInfo; DtorEntryInfo.ParentName = Twine(Buffer, "_dtor").toStringRef(Out); - OffloadEntriesInfoManager.registerTargetRegionEntryInfo( + OMPBuilder.OffloadInfoManager.registerTargetRegionEntryInfo( DtorEntryInfo, Dtor, ID, llvm::OffloadEntriesInfoManager::OMPTargetRegionEntryDtor); } @@ -2942,7 +2941,7 @@ enum KmpTaskTFields { void CGOpenMPRuntime::createOffloadEntriesAndInfoMetadata() { // If we are in simd mode or there are no entries, we don't need to do // anything. 
- if (CGM.getLangOpts().OpenMPSimd || OffloadEntriesInfoManager.empty()) + if (CGM.getLangOpts().OpenMPSimd || OMPBuilder.OffloadInfoManager.empty()) return; llvm::OpenMPIRBuilder::EmitMetadataErrorReportFunctionTy &&ErrorReportFn = @@ -2986,8 +2985,7 @@ void CGOpenMPRuntime::createOffloadEntriesAndInfoMetadata() { } }; - OMPBuilder.createOffloadEntriesAndInfoMetadata(OffloadEntriesInfoManager, - ErrorReportFn); + OMPBuilder.createOffloadEntriesAndInfoMetadata(ErrorReportFn); } /// Loads all the offload entries information from the host IR @@ -3021,7 +3019,7 @@ void CGOpenMPRuntime::loadOffloadInfoMetadata() { return; } - OMPBuilder.loadOffloadInfoMetadata(*ME.get(), OffloadEntriesInfoManager); + OMPBuilder.loadOffloadInfoMetadata(*ME.get()); } void CGOpenMPRuntime::emitKmpRoutineEntryT(QualType KmpInt32Ty) { @@ -6109,10 +6107,9 @@ void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper( getNumTeamsExprForTargetDirective(CGF, D, DefaultValTeams); getNumThreadsExprForTargetDirective(CGF, D, DefaultValThreads); - OMPBuilder.emitTargetRegionFunction(OffloadEntriesInfoManager, EntryInfo, - GenerateOutlinedFunction, DefaultValTeams, - DefaultValThreads, IsOffloadEntry, - OutlinedFn, OutlinedFnID); + OMPBuilder.emitTargetRegionFunction(EntryInfo, GenerateOutlinedFunction, + DefaultValTeams, DefaultValThreads, + IsOffloadEntry, OutlinedFn, OutlinedFnID); if (OutlinedFn != nullptr) CGM.getTargetCodeGenInfo().setTargetAttributes(nullptr, OutlinedFn, CGM); @@ -10136,7 +10133,7 @@ void CGOpenMPRuntime::scanForTargetRegionsFunctions(const Stmt *S, // Is this a target region that should not be emitted as an entry point? If // so just signal we are done with this target region. - if (!OffloadEntriesInfoManager.hasTargetRegionEntryInfo(EntryInfo)) + if (!OMPBuilder.OffloadInfoManager.hasTargetRegionEntryInfo(EntryInfo)) return; switch (E.getDirectiveKind()) { @@ -10392,7 +10389,7 @@ void CGOpenMPRuntime::registerTargetGlobalVariable(const VarDecl *VD, Linkage == llvm::GlobalValue::LinkOnceODRLinkage)) { // Do not create a "ref-variable" if the original is not also available // on the host. - if (!OffloadEntriesInfoManager.hasDeviceGlobalVarEntryInfo(VarName)) + if (!OMPBuilder.OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName)) return; std::string RefName = getName({VarName, "ref"}); if (!CGM.GetGlobalValue(RefName)) { @@ -10427,7 +10424,7 @@ void CGOpenMPRuntime::registerTargetGlobalVariable(const VarDecl *VD, Linkage = llvm::GlobalValue::WeakAnyLinkage; } - OffloadEntriesInfoManager.registerDeviceGlobalVarEntryInfo( + OMPBuilder.OffloadInfoManager.registerDeviceGlobalVarEntryInfo( VarName, Addr, VarSize, Flags, Linkage); } @@ -10562,9 +10559,8 @@ llvm::Function *CGOpenMPRuntime::emitRequiresDirectiveRegFun() { // don't need to do anything. if (CGM.getLangOpts().OMPTargetTriples.empty() || CGM.getLangOpts().OpenMPSimd || CGM.getLangOpts().OpenMPIsDevice || - (OffloadEntriesInfoManager.empty() && - !HasEmittedDeclareTargetRegion && - !HasEmittedTargetRegion)) + (OMPBuilder.OffloadInfoManager.empty() && + !HasEmittedDeclareTargetRegion && !HasEmittedTargetRegion)) return nullptr; // Create and register the function that handles the requires directives. @@ -10585,9 +10581,8 @@ llvm::Function *CGOpenMPRuntime::emitRequiresDirectiveRegFun() { // passed to the runtime. This avoids the runtime from throwing an error // for mismatching requires clauses across compilation units that don't // contain at least 1 target region. 
- assert((HasEmittedTargetRegion || - HasEmittedDeclareTargetRegion || - !OffloadEntriesInfoManager.empty()) && + assert((HasEmittedTargetRegion || HasEmittedDeclareTargetRegion || + !OMPBuilder.OffloadInfoManager.empty()) && "Target or declare target region expected."); if (HasRequiresUnifiedSharedMemory) Flags = OMP_REQ_UNIFIED_SHARED_MEMORY; diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h index e7c1a098c7689..c9678a16ce90b 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.h +++ b/clang/lib/CodeGen/CGOpenMPRuntime.h @@ -508,9 +508,6 @@ class CGOpenMPRuntime { /// kmp_int64 st; // stride /// }; QualType KmpDimTy; - /// Entity that registers the offloading constants that were emitted so - /// far. - llvm::OffloadEntriesInfoManager OffloadEntriesInfoManager; bool ShouldMarkAsGlobal = true; /// List of the emitted declarations. diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index e8c5f04db49f4..4ac28ee17a50b 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -863,7 +863,6 @@ CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM) hasRequiresUnifiedSharedMemory(), CGM.getLangOpts().OpenMPOffloadMandatory); OMPBuilder.setConfig(Config); - OffloadEntriesInfoManager.setConfig(Config); if (!CGM.getLangOpts().OpenMPIsDevice) llvm_unreachable("OpenMP can only handle device code."); diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index a13f8528fa6dd..acf91a2d35c41 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -27,6 +27,7 @@ namespace llvm { class CanonicalLoopInfo; struct TargetRegionEntryInfo; class OffloadEntriesInfoManager; +class OpenMPIRBuilder; /// Move the instruction after an InsertPoint to the beginning of another /// BasicBlock. @@ -160,6 +161,251 @@ class OpenMPIRBuilderConfig { void setSeparator(StringRef S) { Separator = S; } }; +/// Data structure to contain the information needed to uniquely identify +/// a target entry. +struct TargetRegionEntryInfo { + std::string ParentName; + unsigned DeviceID; + unsigned FileID; + unsigned Line; + unsigned Count; + + TargetRegionEntryInfo() + : ParentName(""), DeviceID(0), FileID(0), Line(0), Count(0) {} + TargetRegionEntryInfo(StringRef ParentName, unsigned DeviceID, + unsigned FileID, unsigned Line, unsigned Count = 0) + : ParentName(ParentName), DeviceID(DeviceID), FileID(FileID), Line(Line), + Count(Count) {} + + static void getTargetRegionEntryFnName(SmallVectorImpl &Name, + StringRef ParentName, + unsigned DeviceID, unsigned FileID, + unsigned Line, unsigned Count); + + bool operator<(const TargetRegionEntryInfo RHS) const { + return std::make_tuple(ParentName, DeviceID, FileID, Line, Count) < + std::make_tuple(RHS.ParentName, RHS.DeviceID, RHS.FileID, RHS.Line, + RHS.Count); + } +}; + +/// Class that manages information about offload code regions and data +class OffloadEntriesInfoManager { + /// Number of entries registered so far. + OpenMPIRBuilder *OMPBuilder; + unsigned OffloadingEntriesNum = 0; + +public: + /// Base class of the entries info. + class OffloadEntryInfo { + public: + /// Kind of a given entry. + enum OffloadingEntryInfoKinds : unsigned { + /// Entry is a target region. + OffloadingEntryInfoTargetRegion = 0, + /// Entry is a declare target variable. + OffloadingEntryInfoDeviceGlobalVar = 1, + /// Invalid entry info. 
+ OffloadingEntryInfoInvalid = ~0u + }; + + protected: + OffloadEntryInfo() = delete; + explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind) : Kind(Kind) {} + explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind, unsigned Order, + uint32_t Flags) + : Flags(Flags), Order(Order), Kind(Kind) {} + ~OffloadEntryInfo() = default; + + public: + bool isValid() const { return Order != ~0u; } + unsigned getOrder() const { return Order; } + OffloadingEntryInfoKinds getKind() const { return Kind; } + uint32_t getFlags() const { return Flags; } + void setFlags(uint32_t NewFlags) { Flags = NewFlags; } + Constant *getAddress() const { return cast_or_null(Addr); } + void setAddress(Constant *V) { + assert(!Addr.pointsToAliveValue() && "Address has been set before!"); + Addr = V; + } + static bool classof(const OffloadEntryInfo *Info) { return true; } + + private: + /// Address of the entity that has to be mapped for offloading. + WeakTrackingVH Addr; + + /// Flags associated with the device global. + uint32_t Flags = 0u; + + /// Order this entry was emitted. + unsigned Order = ~0u; + + OffloadingEntryInfoKinds Kind = OffloadingEntryInfoInvalid; + }; + + /// Return true if a there are no entries defined. + bool empty() const; + /// Return number of entries defined so far. + unsigned size() const { return OffloadingEntriesNum; } + + OffloadEntriesInfoManager(OpenMPIRBuilder *builder) : OMPBuilder(builder) {} + + // + // Target region entries related. + // + + /// Kind of the target registry entry. + enum OMPTargetRegionEntryKind : uint32_t { + /// Mark the entry as target region. + OMPTargetRegionEntryTargetRegion = 0x0, + /// Mark the entry as a global constructor. + OMPTargetRegionEntryCtor = 0x02, + /// Mark the entry as a global destructor. + OMPTargetRegionEntryDtor = 0x04, + }; + + /// Target region entries info. + class OffloadEntryInfoTargetRegion final : public OffloadEntryInfo { + /// Address that can be used as the ID of the entry. + Constant *ID = nullptr; + + public: + OffloadEntryInfoTargetRegion() + : OffloadEntryInfo(OffloadingEntryInfoTargetRegion) {} + explicit OffloadEntryInfoTargetRegion(unsigned Order, Constant *Addr, + Constant *ID, + OMPTargetRegionEntryKind Flags) + : OffloadEntryInfo(OffloadingEntryInfoTargetRegion, Order, Flags), + ID(ID) { + setAddress(Addr); + } + + Constant *getID() const { return ID; } + void setID(Constant *V) { + assert(!ID && "ID has been set before!"); + ID = V; + } + static bool classof(const OffloadEntryInfo *Info) { + return Info->getKind() == OffloadingEntryInfoTargetRegion; + } + }; + + /// Initialize target region entry. + /// This is ONLY needed for DEVICE compilation. + void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, + unsigned Order); + /// Register target region entry. + void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, + Constant *Addr, Constant *ID, + OMPTargetRegionEntryKind Flags); + /// Return true if a target region entry with the provided information + /// exists. + bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, + bool IgnoreAddressId = false) const; + + // Return the Name based on \a EntryInfo using the next available Count. + void getTargetRegionEntryFnName(SmallVectorImpl &Name, + const TargetRegionEntryInfo &EntryInfo); + + /// brief Applies action \a Action on all registered entries. 
+ typedef function_ref + OffloadTargetRegionEntryInfoActTy; + void + actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action); + + // + // Device global variable entries related. + // + + /// Kind of the global variable entry.. + enum OMPTargetGlobalVarEntryKind : uint32_t { + /// Mark the entry as a to declare target. + OMPTargetGlobalVarEntryTo = 0x0, + /// Mark the entry as a to declare target link. + OMPTargetGlobalVarEntryLink = 0x1, + }; + + /// Device global variable entries info. + class OffloadEntryInfoDeviceGlobalVar final : public OffloadEntryInfo { + /// Type of the global variable. + int64_t VarSize; + GlobalValue::LinkageTypes Linkage; + + public: + OffloadEntryInfoDeviceGlobalVar() + : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar) {} + explicit OffloadEntryInfoDeviceGlobalVar(unsigned Order, + OMPTargetGlobalVarEntryKind Flags) + : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar, Order, Flags) {} + explicit OffloadEntryInfoDeviceGlobalVar(unsigned Order, Constant *Addr, + int64_t VarSize, + OMPTargetGlobalVarEntryKind Flags, + GlobalValue::LinkageTypes Linkage) + : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar, Order, Flags), + VarSize(VarSize), Linkage(Linkage) { + setAddress(Addr); + } + + int64_t getVarSize() const { return VarSize; } + void setVarSize(int64_t Size) { VarSize = Size; } + GlobalValue::LinkageTypes getLinkage() const { return Linkage; } + void setLinkage(GlobalValue::LinkageTypes LT) { Linkage = LT; } + static bool classof(const OffloadEntryInfo *Info) { + return Info->getKind() == OffloadingEntryInfoDeviceGlobalVar; + } + }; + + /// Initialize device global variable entry. + /// This is ONLY used for DEVICE compilation. + void initializeDeviceGlobalVarEntryInfo(StringRef Name, + OMPTargetGlobalVarEntryKind Flags, + unsigned Order); + + /// Register device global variable entry. + void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, + int64_t VarSize, + OMPTargetGlobalVarEntryKind Flags, + GlobalValue::LinkageTypes Linkage); + /// Checks if the variable with the given name has been registered already. + bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const { + return OffloadEntriesDeviceGlobalVar.count(VarName) > 0; + } + /// Applies action \a Action on all registered entries. + typedef function_ref + OffloadDeviceGlobalVarEntryInfoActTy; + void actOnDeviceGlobalVarEntriesInfo( + const OffloadDeviceGlobalVarEntryInfoActTy &Action); + +private: + /// Return the count of entries at a particular source location. + unsigned + getTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo) const; + + /// Update the count of entries at a particular source location. + void + incrementTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo); + + static TargetRegionEntryInfo + getTargetRegionEntryCountKey(const TargetRegionEntryInfo &EntryInfo) { + return TargetRegionEntryInfo(EntryInfo.ParentName, EntryInfo.DeviceID, + EntryInfo.FileID, EntryInfo.Line, 0); + } + + // Count of entries at a location. + std::map OffloadEntriesTargetRegionCount; + + // Storage for target region entries kind. + typedef std::map + OffloadEntriesTargetRegionTy; + OffloadEntriesTargetRegionTy OffloadEntriesTargetRegion; + /// Storage for device global variable entries kind. The storage is to be + /// indexed by mangled name. + typedef StringMap + OffloadEntriesDeviceGlobalVarTy; + OffloadEntriesDeviceGlobalVarTy OffloadEntriesDeviceGlobalVar; +}; + /// An interface to create LLVM-IR for OpenMP directives. 
/// /// Each OpenMP directive has a corresponding public generator method. @@ -167,7 +413,8 @@ class OpenMPIRBuilder { public: /// Create a new OpenMPIRBuilder operating on the given module \p M. This will /// not have an effect on \p M (see initialize) - OpenMPIRBuilder(Module &M) : M(M), Builder(M.getContext()) {} + OpenMPIRBuilder(Module &M) + : M(M), Builder(M.getContext()), OffloadInfoManager(this) {} ~OpenMPIRBuilder(); /// Initialize the internal state, this will put structures types and @@ -1063,6 +1310,9 @@ class OpenMPIRBuilder { /// Map to remember existing ident_t*. DenseMap, Constant *> IdentMap; + /// Info manager to keep track of target regions. + OffloadEntriesInfoManager OffloadInfoManager; + /// Helper that contains information about regions we need to outline /// during finalization. struct OutlineInfo { @@ -1231,7 +1481,6 @@ class OpenMPIRBuilder { // // We only generate metadata for function that contain target regions. void createOffloadEntriesAndInfoMetadata( - OffloadEntriesInfoManager &OffloadEntriesInfoManager, EmitMetadataErrorReportFunctionTy &ErrorReportFunction); public: @@ -1531,8 +1780,7 @@ class OpenMPIRBuilder { /// \param NumThreads Number default threads /// \param OutlinedFunction Pointer to the outlined function /// \param EntryFnIDName Name of the ID o be created - void emitTargetRegionFunction(OffloadEntriesInfoManager &InfoManager, - TargetRegionEntryInfo &EntryInfo, + void emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, int32_t NumTeams, int32_t NumThreads, bool IsOffloadEntry, Function *&OutlinedFn, @@ -1548,8 +1796,7 @@ class OpenMPIRBuilder { /// \param EntryFnIDName Name of the ID o be created /// \param NumTeams Number default teams /// \param NumThreads Number default threads - Constant *registerTargetRegionFunction(OffloadEntriesInfoManager &InfoManager, - TargetRegionEntryInfo &EntryInfo, + Constant *registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction, StringRef EntryFnName, StringRef EntryFnIDName, @@ -1918,10 +2165,7 @@ class OpenMPIRBuilder { /// /// \param M Module to load Metadata info from. Module passed maybe /// loaded from bitcode file, i.e, different from OpenMPIRBuilder::M module. - /// \param OffloadEntriesInfoManager Initialize Offload Entry information. - void - loadOffloadInfoMetadata(Module &M, - OffloadEntriesInfoManager &OffloadEntriesInfoManager); + void loadOffloadInfoMetadata(Module &M); /// Gets (if variable with the given name already exist) or creates /// internal global variable with the specified Name. The created variable has @@ -1933,253 +2177,6 @@ class OpenMPIRBuilder { unsigned AddressSpace = 0); }; -/// Data structure to contain the information needed to uniquely identify -/// a target entry. 
-struct TargetRegionEntryInfo { - std::string ParentName; - unsigned DeviceID; - unsigned FileID; - unsigned Line; - unsigned Count; - - TargetRegionEntryInfo() - : ParentName(""), DeviceID(0), FileID(0), Line(0), Count(0) {} - TargetRegionEntryInfo(StringRef ParentName, unsigned DeviceID, - unsigned FileID, unsigned Line, unsigned Count = 0) - : ParentName(ParentName), DeviceID(DeviceID), FileID(FileID), Line(Line), - Count(Count) {} - - static void getTargetRegionEntryFnName(SmallVectorImpl &Name, - StringRef ParentName, - unsigned DeviceID, unsigned FileID, - unsigned Line, unsigned Count); - - bool operator<(const TargetRegionEntryInfo RHS) const { - return std::make_tuple(ParentName, DeviceID, FileID, Line, Count) < - std::make_tuple(RHS.ParentName, RHS.DeviceID, RHS.FileID, RHS.Line, - RHS.Count); - } -}; - -/// Class that manages information about offload code regions and data -class OffloadEntriesInfoManager { - /// Number of entries registered so far. - OpenMPIRBuilderConfig Config; - unsigned OffloadingEntriesNum = 0; - -public: - void setConfig(OpenMPIRBuilderConfig C) { Config = C; } - - /// Base class of the entries info. - class OffloadEntryInfo { - public: - /// Kind of a given entry. - enum OffloadingEntryInfoKinds : unsigned { - /// Entry is a target region. - OffloadingEntryInfoTargetRegion = 0, - /// Entry is a declare target variable. - OffloadingEntryInfoDeviceGlobalVar = 1, - /// Invalid entry info. - OffloadingEntryInfoInvalid = ~0u - }; - - protected: - OffloadEntryInfo() = delete; - explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind) : Kind(Kind) {} - explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind, unsigned Order, - uint32_t Flags) - : Flags(Flags), Order(Order), Kind(Kind) {} - ~OffloadEntryInfo() = default; - - public: - bool isValid() const { return Order != ~0u; } - unsigned getOrder() const { return Order; } - OffloadingEntryInfoKinds getKind() const { return Kind; } - uint32_t getFlags() const { return Flags; } - void setFlags(uint32_t NewFlags) { Flags = NewFlags; } - Constant *getAddress() const { return cast_or_null(Addr); } - void setAddress(Constant *V) { - assert(!Addr.pointsToAliveValue() && "Address has been set before!"); - Addr = V; - } - static bool classof(const OffloadEntryInfo *Info) { return true; } - - private: - /// Address of the entity that has to be mapped for offloading. - WeakTrackingVH Addr; - - /// Flags associated with the device global. - uint32_t Flags = 0u; - - /// Order this entry was emitted. - unsigned Order = ~0u; - - OffloadingEntryInfoKinds Kind = OffloadingEntryInfoInvalid; - }; - - /// Return true if a there are no entries defined. - bool empty() const; - /// Return number of entries defined so far. - unsigned size() const { return OffloadingEntriesNum; } - - OffloadEntriesInfoManager() : Config() {} - - // - // Target region entries related. - // - - /// Kind of the target registry entry. - enum OMPTargetRegionEntryKind : uint32_t { - /// Mark the entry as target region. - OMPTargetRegionEntryTargetRegion = 0x0, - /// Mark the entry as a global constructor. - OMPTargetRegionEntryCtor = 0x02, - /// Mark the entry as a global destructor. - OMPTargetRegionEntryDtor = 0x04, - }; - - /// Target region entries info. - class OffloadEntryInfoTargetRegion final : public OffloadEntryInfo { - /// Address that can be used as the ID of the entry. 
- Constant *ID = nullptr; - - public: - OffloadEntryInfoTargetRegion() - : OffloadEntryInfo(OffloadingEntryInfoTargetRegion) {} - explicit OffloadEntryInfoTargetRegion(unsigned Order, Constant *Addr, - Constant *ID, - OMPTargetRegionEntryKind Flags) - : OffloadEntryInfo(OffloadingEntryInfoTargetRegion, Order, Flags), - ID(ID) { - setAddress(Addr); - } - - Constant *getID() const { return ID; } - void setID(Constant *V) { - assert(!ID && "ID has been set before!"); - ID = V; - } - static bool classof(const OffloadEntryInfo *Info) { - return Info->getKind() == OffloadingEntryInfoTargetRegion; - } - }; - - /// Initialize target region entry. - /// This is ONLY needed for DEVICE compilation. - void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, - unsigned Order); - /// Register target region entry. - void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, - Constant *Addr, Constant *ID, - OMPTargetRegionEntryKind Flags); - /// Return true if a target region entry with the provided information - /// exists. - bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, - bool IgnoreAddressId = false) const; - - // Return the Name based on \a EntryInfo using the next available Count. - void getTargetRegionEntryFnName(SmallVectorImpl &Name, - const TargetRegionEntryInfo &EntryInfo); - - /// brief Applies action \a Action on all registered entries. - typedef function_ref - OffloadTargetRegionEntryInfoActTy; - void - actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action); - - // - // Device global variable entries related. - // - - /// Kind of the global variable entry.. - enum OMPTargetGlobalVarEntryKind : uint32_t { - /// Mark the entry as a to declare target. - OMPTargetGlobalVarEntryTo = 0x0, - /// Mark the entry as a to declare target link. - OMPTargetGlobalVarEntryLink = 0x1, - }; - - /// Device global variable entries info. - class OffloadEntryInfoDeviceGlobalVar final : public OffloadEntryInfo { - /// Type of the global variable. - int64_t VarSize; - GlobalValue::LinkageTypes Linkage; - - public: - OffloadEntryInfoDeviceGlobalVar() - : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar) {} - explicit OffloadEntryInfoDeviceGlobalVar(unsigned Order, - OMPTargetGlobalVarEntryKind Flags) - : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar, Order, Flags) {} - explicit OffloadEntryInfoDeviceGlobalVar(unsigned Order, Constant *Addr, - int64_t VarSize, - OMPTargetGlobalVarEntryKind Flags, - GlobalValue::LinkageTypes Linkage) - : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar, Order, Flags), - VarSize(VarSize), Linkage(Linkage) { - setAddress(Addr); - } - - int64_t getVarSize() const { return VarSize; } - void setVarSize(int64_t Size) { VarSize = Size; } - GlobalValue::LinkageTypes getLinkage() const { return Linkage; } - void setLinkage(GlobalValue::LinkageTypes LT) { Linkage = LT; } - static bool classof(const OffloadEntryInfo *Info) { - return Info->getKind() == OffloadingEntryInfoDeviceGlobalVar; - } - }; - - /// Initialize device global variable entry. - /// This is ONLY used for DEVICE compilation. - void initializeDeviceGlobalVarEntryInfo(StringRef Name, - OMPTargetGlobalVarEntryKind Flags, - unsigned Order); - - /// Register device global variable entry. - void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, - int64_t VarSize, - OMPTargetGlobalVarEntryKind Flags, - GlobalValue::LinkageTypes Linkage); - /// Checks if the variable with the given name has been registered already. 
- bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const { - return OffloadEntriesDeviceGlobalVar.count(VarName) > 0; - } - /// Applies action \a Action on all registered entries. - typedef function_ref - OffloadDeviceGlobalVarEntryInfoActTy; - void actOnDeviceGlobalVarEntriesInfo( - const OffloadDeviceGlobalVarEntryInfoActTy &Action); - -private: - /// Return the count of entries at a particular source location. - unsigned - getTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo) const; - - /// Update the count of entries at a particular source location. - void - incrementTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo); - - static TargetRegionEntryInfo - getTargetRegionEntryCountKey(const TargetRegionEntryInfo &EntryInfo) { - return TargetRegionEntryInfo(EntryInfo.ParentName, EntryInfo.DeviceID, - EntryInfo.FileID, EntryInfo.Line, 0); - } - - // Count of entries at a location. - std::map OffloadEntriesTargetRegionCount; - - // Storage for target region entries kind. - typedef std::map - OffloadEntriesTargetRegionTy; - OffloadEntriesTargetRegionTy OffloadEntriesTargetRegion; - /// Storage for device global variable entries kind. The storage is to be - /// indexed by mangled name. - typedef StringMap - OffloadEntriesDeviceGlobalVarTy; - OffloadEntriesDeviceGlobalVarTy OffloadEntriesDeviceGlobalVar; -}; - /// Class to represented the control flow structure of an OpenMP canonical loop. /// /// The control-flow structure is standardized for easy consumption by diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 81e2904bbd019..12c7c42ac5fe5 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -3999,13 +3999,13 @@ Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn, } void OpenMPIRBuilder::emitTargetRegionFunction( - OffloadEntriesInfoManager &InfoManager, TargetRegionEntryInfo &EntryInfo, + TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, int32_t NumTeams, int32_t NumThreads, bool IsOffloadEntry, Function *&OutlinedFn, Constant *&OutlinedFnID) { SmallString<64> EntryFnName; - InfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo); + OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo); OutlinedFn = Config.isEmbedded() || !Config.openMPOffloadMandatory() ? 
GenerateFunctionCallback(EntryFnName) @@ -4023,19 +4023,18 @@ void OpenMPIRBuilder::emitTargetRegionFunction( : createPlatformSpecificName({EntryFnName, "region_id"}); OutlinedFnID = registerTargetRegionFunction( - InfoManager, EntryInfo, OutlinedFn, EntryFnName, EntryFnIDName, NumTeams, - NumThreads); + EntryInfo, OutlinedFn, EntryFnName, EntryFnIDName, NumTeams, NumThreads); } Constant *OpenMPIRBuilder::registerTargetRegionFunction( - OffloadEntriesInfoManager &InfoManager, TargetRegionEntryInfo &EntryInfo, - Function *OutlinedFn, StringRef EntryFnName, StringRef EntryFnIDName, - int32_t NumTeams, int32_t NumThreads) { + TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn, + StringRef EntryFnName, StringRef EntryFnIDName, int32_t NumTeams, + int32_t NumThreads) { if (OutlinedFn) setOutlinedTargetRegionFunctionAttributes(OutlinedFn, NumTeams, NumThreads); auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName); auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName); - InfoManager.registerTargetRegionEntryInfo( + OffloadInfoManager.registerTargetRegionEntryInfo( EntryInfo, EntryAddr, OutlinedFnID, OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion); return OutlinedFnID; @@ -4897,18 +4896,17 @@ void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr, // We only generate metadata for function that contain target regions. void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata( - OffloadEntriesInfoManager &OffloadEntriesInfoManager, EmitMetadataErrorReportFunctionTy &ErrorFn) { // If there are no entries, we don't need to do anything. - if (OffloadEntriesInfoManager.empty()) + if (OffloadInfoManager.empty()) return; LLVMContext &C = M.getContext(); SmallVector, 16> - OrderedEntries(OffloadEntriesInfoManager.size()); + OrderedEntries(OffloadInfoManager.size()); // Auxiliary methods to create metadata values and strings. auto &&GetMDInt = [this](unsigned V) { @@ -4947,8 +4945,7 @@ void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata( MD->addOperand(MDNode::get(C, Ops)); }; - OffloadEntriesInfoManager.actOnTargetRegionEntriesInfo( - TargetRegionMetadataEmitter); + OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter); // Create function that emits metadata for each device global variable entry; auto &&DeviceGlobalVarMetadataEmitter = @@ -4973,7 +4970,7 @@ void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata( MD->addOperand(MDNode::get(C, Ops)); }; - OffloadEntriesInfoManager.actOnDeviceGlobalVarEntriesInfo( + OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo( DeviceGlobalVarMetadataEmitter); for (const auto &E : OrderedEntries) { @@ -5061,8 +5058,7 @@ void OffloadEntriesInfoManager::getTargetRegionEntryFnName( /// Loads all the offload entries information from the host IR /// metadata. -void OpenMPIRBuilder::loadOffloadInfoMetadata( - Module &M, OffloadEntriesInfoManager &OffloadEntriesInfoManager) { +void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) { // If we are in target mode, load the metadata from the host IR. This code has // to match the metadata creation in createOffloadEntriesAndInfoMetadata(). 
@@ -5092,13 +5088,13 @@ void OpenMPIRBuilder::loadOffloadInfoMetadata( /*FileID=*/GetMDInt(2), /*Line=*/GetMDInt(4), /*Count=*/GetMDInt(5)); - OffloadEntriesInfoManager.initializeTargetRegionEntryInfo( - EntryInfo, /*Order=*/GetMDInt(6)); + OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo, + /*Order=*/GetMDInt(6)); break; } case OffloadEntriesInfoManager::OffloadEntryInfo:: OffloadingEntryInfoDeviceGlobalVar: - OffloadEntriesInfoManager.initializeDeviceGlobalVarEntryInfo( + OffloadInfoManager.initializeDeviceGlobalVarEntryInfo( /*MangledName=*/GetMDString(1), static_cast( /*Flags=*/GetMDInt(2)), @@ -5147,7 +5143,7 @@ void OffloadEntriesInfoManager::registerTargetRegionEntryInfo( // If we are emitting code for a target, the entry is already initialized, // only has to be registered. - if (Config.isEmbedded()) { + if (OMPBuilder->Config.isEmbedded()) { // This could happen if the device compilation is invoked standalone. if (!hasTargetRegionEntryInfo(EntryInfo)) { return; @@ -5202,7 +5198,7 @@ void OffloadEntriesInfoManager::initializeDeviceGlobalVarEntryInfo( void OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo( StringRef VarName, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage) { - if (Config.isEmbedded()) { + if (OMPBuilder->Config.isEmbedded()) { // This could happen if the device compilation is invoked standalone. if (!hasDeviceGlobalVarEntryInfo(VarName)) return; diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index 05a1d7a58b84d..aee8ed26a6fa6 100644 --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -5730,8 +5730,9 @@ TEST_F(OpenMPIRBuilderTest, EmitOffloadingArraysArguments) { } TEST_F(OpenMPIRBuilderTest, OffloadEntriesInfoManager) { - OffloadEntriesInfoManager InfoManager; - InfoManager.setConfig(OpenMPIRBuilderConfig(true, false, false, false)); + OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.setConfig(OpenMPIRBuilderConfig(true, false, false, false)); + OffloadEntriesInfoManager &InfoManager = OMPBuilder.OffloadInfoManager; TargetRegionEntryInfo EntryInfo("parent", 1, 2, 4, 0); InfoManager.initializeTargetRegionEntryInfo(EntryInfo, 0); EXPECT_TRUE(InfoManager.hasTargetRegionEntryInfo(EntryInfo)); From c640a146c4caa3cca559e308e2e7ecc78c45140d Mon Sep 17 00:00:00 2001 From: Felipe de Azevedo Piovezan Date: Thu, 23 Mar 2023 08:18:53 -0400 Subject: [PATCH 138/208] [lldb] Explicitly set libcxx paths when USE_SYSTEM_STDLIB is provided For tests marked as "USE_SYSTEM_STDLIB", the expectation is that the system's standard library should be used. However, the implementation of this flag is such that we simply don't pass _any_ libcxxx-related flags to Clang; in turn, Clang will use its defaults. For a Clang/Libcxx pair compiled together, Clang defaults to: 1. The headers of the sibling libcxx. 2. The libraries of the system. This mismatch is actually a bug in the driver; once fixed, however, (2) would point to the sibling libcxx as well, which is _not_ what test authors intended with the USE_SYSTEM_STDLIB flag. As such, this patch explicitly sets a path to the system's libraries. This change is done only in Apple platforms so that we can test this works in this case first. 
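Concretely, the harness now pins both the libc++ headers and the libc++ library to the SDK instead of trusting the driver defaults. As a rough sketch (illustrative only; the SDKROOT value is an assumed example, e.g. the output of xcrun --show-sdk-path), the effective compile/link line for a test on Darwin gains flags along these lines:

    # Hypothetical effective invocation under USE_SYSTEM_STDLIB=1 on Darwin.
    # SDKROOT is assumed to point at an installed SDK (e.g. via xcrun --show-sdk-path).
    clang++ -nostdinc++ -cxx-isystem "$SDKROOT/usr/include/c++/v1" \
            -nostdlib++ -L"$SDKROOT/usr/lib" -Wl,-rpath,"$SDKROOT/usr/lib" -lc++ \
            main.cpp -o a.out

This mirrors the Makefile.rules change below: -nostdinc++ and -cxx-isystem select the SDK's libc++ headers, while -nostdlib++ together with the explicit -L/-rpath/-lc++ selects the SDK's libc++ library.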
Differential Revision: https://reviews.llvm.org/D146714 --- .../packages/Python/lldbsuite/test/make/Makefile.rules | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules index 25c4d88763326..4c225ed360be5 100644 --- a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules +++ b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules @@ -428,6 +428,16 @@ ifeq (1,$(USE_LIBCPP)) endif endif +ifeq (1, $(USE_SYSTEM_STDLIB)) + ifeq "$(OS)" "Darwin" + ifeq "$(SDKROOT)" "" + $(error "SDKROOT must be set on Darwin to use the system libcxx") + endif + CXXFLAGS += -nostdlib++ -nostdinc++ -cxx-isystem $(SDKROOT)/usr/include/c++/v1 + LDFLAGS += -L$(SDKROOT)/usr/lib -Wl,-rpath,$(SDKROOT)/usr/lib -lc++ + endif +endif + # If no explicit request was made, but we have paths to a custom libcxx, use # them. ifeq ($(or $(USE_LIBSTDCPP), $(USE_LIBCPP), $(USE_SYSTEM_STDLIB)),) From 2cfd06ba672f4e3097b6c2c576bdb876d37c71d1 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 23 Mar 2023 08:47:44 -0700 Subject: [PATCH 139/208] [BoundsChecking] Don't crash on scalable vector sizes --- .../Instrumentation/BoundsChecking.cpp | 4 +- .../Instrumentation/BoundsChecking/simple.ll | 74 ++++++++++++++++--- 2 files changed, 64 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp index 8b1d39ad412fa..04ffbf6636e1a 100644 --- a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp +++ b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -56,7 +56,7 @@ static Value *getBoundsCheckCond(Value *Ptr, Value *InstVal, const DataLayout &DL, TargetLibraryInfo &TLI, ObjectSizeOffsetEvaluator &ObjSizeEval, BuilderTy &IRB, ScalarEvolution &SE) { - uint64_t NeededSize = DL.getTypeStoreSize(InstVal->getType()); + TypeSize NeededSize = DL.getTypeStoreSize(InstVal->getType()); LLVM_DEBUG(dbgs() << "Instrument " << *Ptr << " for " << Twine(NeededSize) << " bytes\n"); @@ -72,7 +72,7 @@ static Value *getBoundsCheckCond(Value *Ptr, Value *InstVal, ConstantInt *SizeCI = dyn_cast(Size); Type *IntTy = DL.getIntPtrType(Ptr->getType()); - Value *NeededSizeVal = ConstantInt::get(IntTy, NeededSize); + Value *NeededSizeVal = IRB.CreateTypeSize(IntTy, NeededSize); auto SizeRange = SE.getUnsignedRange(SE.getSCEV(Size)); auto OffsetRange = SE.getUnsignedRange(SE.getSCEV(Offset)); diff --git a/llvm/test/Instrumentation/BoundsChecking/simple.ll b/llvm/test/Instrumentation/BoundsChecking/simple.ll index 57858618d17b3..e329b90d0cde4 100644 --- a/llvm/test/Instrumentation/BoundsChecking/simple.ll +++ b/llvm/test/Instrumentation/BoundsChecking/simple.ll @@ -33,7 +33,7 @@ define void @f2() nounwind { ; CHECK-NEXT: store i32 3, ptr [[IDX]], align 4 ; CHECK-NEXT: ret void ; CHECK: trap: -; CHECK-NEXT: call void @llvm.trap() #[[ATTR5:[0-9]+]] +; CHECK-NEXT: call void @llvm.trap() #[[ATTR6:[0-9]+]] ; CHECK-NEXT: unreachable ; %1 = tail call ptr @malloc(i64 32) @@ -57,7 +57,7 @@ define void @f3(i64 %x) nounwind { ; CHECK-NEXT: store i32 3, ptr [[IDX]], align 4 ; CHECK-NEXT: ret void ; CHECK: trap: -; CHECK-NEXT: call void @llvm.trap() #[[ATTR5]] +; CHECK-NEXT: call void @llvm.trap() #[[ATTR6]] ; CHECK-NEXT: unreachable ; %1 = tail call ptr @calloc(i64 4, i64 %x) @@ -93,7 +93,7 @@ define void @f4(i64 %x) nounwind { ; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[IDX]], align 4 ; CHECK-NEXT: ret void ; CHECK: trap: -; CHECK-NEXT: call void 
@llvm.trap() #[[ATTR5]] +; CHECK-NEXT: call void @llvm.trap() #[[ATTR6]] ; CHECK-NEXT: unreachable ; %1 = tail call ptr @realloc(ptr null, i64 %x) nounwind @@ -115,7 +115,7 @@ define void @f5(i64 %x) nounwind { ; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[IDX]], align 4 ; CHECK-NEXT: ret void ; CHECK: trap: -; CHECK-NEXT: call void @llvm.trap() #[[ATTR5]] +; CHECK-NEXT: call void @llvm.trap() #[[ATTR6]] ; CHECK-NEXT: unreachable ; %idx = getelementptr inbounds [8 x i8], ptr @.str, i64 0, i64 %x @@ -137,7 +137,7 @@ define void @f5_as1(i64 %x) nounwind { ; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(1) [[IDX]], align 4 ; CHECK-NEXT: ret void ; CHECK: trap: -; CHECK-NEXT: call void @llvm.trap() #[[ATTR5]] +; CHECK-NEXT: call void @llvm.trap() #[[ATTR6]] ; CHECK-NEXT: unreachable ; %idx = getelementptr inbounds [8 x i8], ptr addrspace(1) @.str_as1, i64 0, i64 %x @@ -169,7 +169,7 @@ define void @f7(i64 %x) nounwind { ; CHECK-NEXT: [[TMP8:%.*]] = load i128, ptr [[TMP2]], align 4 ; CHECK-NEXT: ret void ; CHECK: trap: -; CHECK-NEXT: call void @llvm.trap() #[[ATTR5]] +; CHECK-NEXT: call void @llvm.trap() #[[ATTR6]] ; CHECK-NEXT: unreachable ; %1 = alloca i128, i64 %x @@ -222,7 +222,7 @@ define void @f10(i64 %x, i64 %y) nounwind { ; CHECK-NEXT: [[TMP12:%.*]] = load i128, ptr [[TMP6]], align 4 ; CHECK-NEXT: ret void ; CHECK: trap: -; CHECK-NEXT: call void @llvm.trap() #[[ATTR5]] +; CHECK-NEXT: call void @llvm.trap() #[[ATTR6]] ; CHECK-NEXT: unreachable ; %1 = alloca i128, i64 %x @@ -240,7 +240,7 @@ define void @f11(ptr byval(i128) %x) nounwind { ; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[TMP1]], align 4 ; CHECK-NEXT: ret void ; CHECK: trap: -; CHECK-NEXT: call void @llvm.trap() #[[ATTR5]] +; CHECK-NEXT: call void @llvm.trap() #[[ATTR6]] ; CHECK-NEXT: unreachable ; %1 = getelementptr inbounds i8, ptr %x, i64 16 @@ -256,7 +256,7 @@ define void @f11_as1(ptr addrspace(1) byval(i128) %x) nounwind { ; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr addrspace(1) [[TMP1]], align 4 ; CHECK-NEXT: ret void ; CHECK: trap: -; CHECK-NEXT: call void @llvm.trap() #[[ATTR5]] +; CHECK-NEXT: call void @llvm.trap() #[[ATTR6]] ; CHECK-NEXT: unreachable ; %1 = getelementptr inbounds i8, ptr addrspace(1) %x, i16 16 @@ -282,7 +282,7 @@ define i64 @f12(i64 %x, i64 %y) nounwind { ; CHECK-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP4]], align 8 ; CHECK-NEXT: ret i64 [[TMP12]] ; CHECK: trap: -; CHECK-NEXT: call void @llvm.trap() #[[ATTR5]] +; CHECK-NEXT: call void @llvm.trap() #[[ATTR6]] ; CHECK-NEXT: unreachable ; %1 = tail call ptr @calloc(i64 1, i64 %x) @@ -354,7 +354,7 @@ define i8 @f14(i1 %i) { ; CHECK-NEXT: [[RET:%.*]] = load i8, ptr [[P]], align 1 ; CHECK-NEXT: ret i8 [[RET]] ; CHECK: trap: -; CHECK-NEXT: call void @llvm.trap() #[[ATTR5]] +; CHECK-NEXT: call void @llvm.trap() #[[ATTR6]] ; CHECK-NEXT: unreachable ; entry: @@ -396,7 +396,7 @@ define i8 @f15(i1 %i) { ; CHECK-NEXT: [[RET:%.*]] = load i8, ptr [[ALLOC]], align 1 ; CHECK-NEXT: ret i8 [[RET]] ; CHECK: trap: -; CHECK-NEXT: call void @llvm.trap() #[[ATTR5]] +; CHECK-NEXT: call void @llvm.trap() #[[ATTR6]] ; CHECK-NEXT: unreachable ; entry: @@ -414,3 +414,53 @@ bb2: %ret = load i8, ptr %alloc ret i8 %ret } + +define <4 x i32> @load_vector(i64 %y) nounwind { +; CHECK-LABEL: @load_vector( +; CHECK-NEXT: [[TMP1:%.*]] = tail call ptr @calloc(i64 1, i64 256) +; CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[Y:%.*]], 8 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 0, [[DOTIDX]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i64 [[Y]] +; CHECK-NEXT: [[TMP4:%.*]] = sub 
i64 256, [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 256, [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP4]], 16 +; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] +; CHECK-NEXT: br i1 [[TMP7]], label [[TRAP:%.*]], label [[TMP8:%.*]] +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr [[TMP3]], align 8 +; CHECK-NEXT: ret <4 x i32> [[TMP9]] +; CHECK: trap: +; CHECK-NEXT: call void @llvm.trap() #[[ATTR6]] +; CHECK-NEXT: unreachable +; + %1 = tail call ptr @calloc(i64 1, i64 256) + %2 = getelementptr inbounds i64, ptr %1, i64 %y + %3 = load <4 x i32>, ptr %2, align 8 + ret <4 x i32> %3 +} + +define @load_scalable_vector(i64 %y) nounwind { +; CHECK-LABEL: @load_scalable_vector( +; CHECK-NEXT: [[TMP1:%.*]] = tail call ptr @calloc(i64 1, i64 256) +; CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[Y:%.*]], 8 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 0, [[DOTIDX]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i64 [[Y]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = sub i64 256, [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ult i64 256, [[TMP2]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] +; CHECK-NEXT: br i1 [[TMP9]], label [[TRAP:%.*]], label [[TMP10:%.*]] +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = load , ptr [[TMP3]], align 8 +; CHECK-NEXT: ret [[TMP11]] +; CHECK: trap: +; CHECK-NEXT: call void @llvm.trap() #[[ATTR6]] +; CHECK-NEXT: unreachable +; + %1 = tail call ptr @calloc(i64 1, i64 256) + %2 = getelementptr inbounds i64, ptr %1, i64 %y + %3 = load , ptr %2, align 8 + ret %3 +} From 16b6826bdd6e5cc02844c10bd1f9af388a6ffed8 Mon Sep 17 00:00:00 2001 From: Ding Xiang Fei Date: Thu, 23 Mar 2023 17:04:21 +0100 Subject: [PATCH 140/208] [MergeFuncs] Add tests for D144682 (NFC) I forgot to git add this test when committing the change. --- .../MergeFunc/mergefunc-preserve-nonnull.ll | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 llvm/test/Transforms/MergeFunc/mergefunc-preserve-nonnull.ll diff --git a/llvm/test/Transforms/MergeFunc/mergefunc-preserve-nonnull.ll b/llvm/test/Transforms/MergeFunc/mergefunc-preserve-nonnull.ll new file mode 100644 index 0000000000000..12bb0e8b38425 --- /dev/null +++ b/llvm/test/Transforms/MergeFunc/mergefunc-preserve-nonnull.ll @@ -0,0 +1,53 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=mergefunc -S < %s | FileCheck %s + +; This test makes sure that the mergefunc pass does not merge functions +; that have different nonnull assertions. 
+ +%1 = type ptr + +define void @f1(ptr %0, ptr %1) { +; CHECK-LABEL: @f1( +; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP1:%.*]], align 8, !nonnull !0 +; CHECK-NEXT: store ptr [[TMP3]], ptr [[TMP0:%.*]], align 8 +; CHECK-NEXT: ret void +; + %3 = load ptr, ptr %1, align 8, !nonnull !0 + store ptr %3, ptr %0, align 8 + ret void +} + +define void @f2(ptr %0, ptr %1) { +; CHECK-LABEL: @f2( +; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP1:%.*]], align 8 +; CHECK-NEXT: store ptr [[TMP3]], ptr [[TMP0:%.*]], align 8 +; CHECK-NEXT: ret void +; + %3 = load ptr, ptr %1, align 8 + store ptr %3, ptr %0, align 8 + ret void +} + +define void @f3(ptr %0, ptr %1) { +; CHECK-LABEL: @f3( +; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP1:%.*]], align 8, !noundef !0 +; CHECK-NEXT: store ptr [[TMP3]], ptr [[TMP0:%.*]], align 8 +; CHECK-NEXT: ret void +; + %3 = load ptr, ptr %1, align 8, !noundef !0 + store ptr %3, ptr %0, align 8 + ret void +} + +define void @f4(ptr %0, ptr %1) { +; CHECK-LABEL: @f4( +; CHECK-NEXT: tail call void @f3(ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) +; CHECK-NEXT: ret void +; + %3 = load ptr, ptr %1, align 8, !noundef !0, !dbg !1 + store ptr %3, ptr %0, align 8 + ret void +} + +!0 = !{} +!1 = !{} From ec294d2f8f1839e11c13ee32279cd28c1f46f66f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 23 Mar 2023 16:10:32 +0000 Subject: [PATCH 141/208] [X86] LowerVectorAllZero - lower to CMP(MOVMSK(NOT(X)),0) instead of CMP(MOVMSK(X),65535) In most cases the NOT will still be scalarized, but it allows us to perform the CMP(X,0) combines inside combineCMP() --- llvm/lib/Target/X86/X86ISelLowering.cpp | 3 +- llvm/test/CodeGen/X86/pr45378.ll | 4 +- llvm/test/CodeGen/X86/ptest.ll | 18 ++++---- llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll | 41 +++++++++---------- 4 files changed, 33 insertions(+), 33 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 2d371566381c8..3a4173e443798 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -24224,9 +24224,10 @@ static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC, V = DAG.getBitcast(MVT::v16i8, MaskBits(V)); V = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, V, getZeroVector(MVT::v16i8, Subtarget, DAG, DL)); + V = DAG.getNOT(DL, V, MVT::v16i8); V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V); return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V, - DAG.getConstant(0xFFFF, DL, MVT::i32)); + DAG.getConstant(0, DL, MVT::i32)); } // Check whether an OR'd reduction tree is PTEST-able, or if we can fallback to diff --git a/llvm/test/CodeGen/X86/pr45378.ll b/llvm/test/CodeGen/X86/pr45378.ll index fecfa95f8b838..aa870b7afbd38 100644 --- a/llvm/test/CodeGen/X86/pr45378.ll +++ b/llvm/test/CodeGen/X86/pr45378.ll @@ -15,7 +15,7 @@ define i1 @parseHeaders(ptr %ptr) nounwind { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; @@ -45,7 +45,7 @@ define i1 @parseHeaders2_scalar_or(ptr %ptr) nounwind { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/ptest.ll b/llvm/test/CodeGen/X86/ptest.ll index bedcfebc5f6e7..5983d502af3dd 100644 --- 
a/llvm/test/CodeGen/X86/ptest.ll +++ b/llvm/test/CodeGen/X86/ptest.ll @@ -10,7 +10,7 @@ define i32 @veccond128(<4 x i32> %input) { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: je .LBB0_2 ; SSE2-NEXT: # %bb.1: # %if-true-block ; SSE2-NEXT: xorl %eax, %eax @@ -57,7 +57,7 @@ define i32 @veccond256(<8 x i32> %input) { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: je .LBB1_2 ; SSE2-NEXT: # %bb.1: # %if-true-block ; SSE2-NEXT: xorl %eax, %eax @@ -109,7 +109,7 @@ define i32 @veccond512(<16 x i32> %input) { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: je .LBB2_2 ; SSE2-NEXT: # %bb.1: # %if-true-block ; SSE2-NEXT: xorl %eax, %eax @@ -176,7 +176,7 @@ define i32 @vectest128(<4 x i32> %input) { ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %ecx ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; SSE2-NEXT: xorl $65535, %ecx # imm = 0xFFFF ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; @@ -207,7 +207,7 @@ define i32 @vectest256(<8 x i32> %input) { ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %ecx ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; SSE2-NEXT: xorl $65535, %ecx # imm = 0xFFFF ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; @@ -242,7 +242,7 @@ define i32 @vectest512(<16 x i32> %input) { ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %ecx ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; SSE2-NEXT: xorl $65535, %ecx # imm = 0xFFFF ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; @@ -286,7 +286,7 @@ define i32 @vecsel128(<4 x i32> %input, i32 %a, i32 %b) { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %ecx -; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; SSE2-NEXT: xorl $65535, %ecx # imm = 0xFFFF ; SSE2-NEXT: cmovel %esi, %eax ; SSE2-NEXT: retq ; @@ -317,7 +317,7 @@ define i32 @vecsel256(<8 x i32> %input, i32 %a, i32 %b) { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %ecx -; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; SSE2-NEXT: xorl $65535, %ecx # imm = 0xFFFF ; SSE2-NEXT: cmovel %esi, %eax ; SSE2-NEXT: retq ; @@ -352,7 +352,7 @@ define i32 @vecsel512(<16 x i32> %input, i32 %a, i32 %b) { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %ecx -; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; SSE2-NEXT: xorl $65535, %ecx # imm = 0xFFFF ; SSE2-NEXT: cmovel %esi, %eax ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll index a489a5e6099f0..761ad105f75dc 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll @@ -16,7 +16,7 @@ define i1 @test_v2i64(<2 x i64> %a0) { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; @@ -43,7 +43,7 @@ define i1 
@test_v4i64(<4 x i64> %a0) { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; @@ -74,7 +74,7 @@ define i1 @test_v8i64(<8 x i64> %a0) { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; @@ -128,7 +128,7 @@ define i1 @test_v16i64(<16 x i64> %a0) { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; @@ -207,7 +207,7 @@ define i1 @test_v4i32(<4 x i32> %a0) { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; @@ -234,7 +234,7 @@ define i1 @test_v8i32(<8 x i32> %a0) { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; @@ -265,7 +265,7 @@ define i1 @test_v16i32(<16 x i32> %a0) { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; @@ -319,7 +319,7 @@ define i1 @test_v32i32(<32 x i32> %a0) { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; @@ -417,7 +417,7 @@ define i1 @test_v8i16(<8 x i16> %a0) { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; @@ -444,7 +444,7 @@ define i1 @test_v16i16(<16 x i16> %a0) { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; @@ -475,7 +475,7 @@ define i1 @test_v32i16(<32 x i16> %a0) { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; @@ -529,7 +529,7 @@ define i1 @test_v64i16(<64 x i16> %a0) { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; @@ -646,7 +646,7 @@ define i1 @test_v16i8(<16 x i8> %a0) { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; @@ -673,7 +673,7 @@ define i1 
@test_v32i8(<32 x i8> %a0) { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; @@ -704,7 +704,7 @@ define i1 @test_v64i8(<64 x i8> %a0) { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; @@ -758,7 +758,7 @@ define i1 @test_v128i8(<128 x i8> %a0) { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; @@ -866,7 +866,7 @@ define i1 @mask_v8i32(<8 x i32> %a0) { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; @@ -913,7 +913,7 @@ define i1 @trunc_v16i16(<16 x i16> %a0) { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; @@ -964,8 +964,7 @@ define i1 @mask_v128i8(<128 x i8> %a0) { ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: psllw $7, %xmm0 ; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; @@ -1026,7 +1025,7 @@ define zeroext i1 @PR44781(ptr %0) { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; From dc2f2d2180f1d1a1835dc55478d3bcceea41a4b1 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Thu, 23 Mar 2023 09:15:57 -0700 Subject: [PATCH 142/208] [MemProf] Use stable_sort to avoid non-determinism Switch from std::sort to std::stable_sort when sorting callsites to avoid non-determinism when the comparisons are equal. This showed up in internal testing of fe27495be2040007c7b20844a9371b06156ab405. --- .../IPO/MemProfContextDisambiguation.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index b2fcea1ec8694..762e4ce0c3e79 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -1032,13 +1032,13 @@ void CallsiteContextGraph::updateStackNodes() { // latter is so that we can specially handle calls that have identical stack // id sequences (either due to cloning or artificially because of the MIB // context pruning). 
- std::sort(Calls.begin(), Calls.end(), - [](const CallContextInfo &A, const CallContextInfo &B) { - auto &IdsA = std::get<1>(A); - auto &IdsB = std::get<1>(B); - return IdsA.size() > IdsB.size() || - (IdsA.size() == IdsB.size() && IdsA < IdsB); - }); + std::stable_sort(Calls.begin(), Calls.end(), + [](const CallContextInfo &A, const CallContextInfo &B) { + auto &IdsA = std::get<1>(A); + auto &IdsB = std::get<1>(B); + return IdsA.size() > IdsB.size() || + (IdsA.size() == IdsB.size() && IdsA < IdsB); + }); // Find the node for the last stack id, which should be the same // across all calls recorded for this id, and is the id for this From 2bececb8bed1f8fcd8d54dba831ceb117717bfcc Mon Sep 17 00:00:00 2001 From: Viktoriia Bakalova Date: Tue, 28 Feb 2023 16:27:05 +0000 Subject: [PATCH 143/208] [clangd] Add provider info on symbol hover. Differential Revision: https://reviews.llvm.org/D144976 --- clang-tools-extra/clangd/Hover.cpp | 65 +++++++++++ clang-tools-extra/clangd/Hover.h | 3 + clang-tools-extra/clangd/IncludeCleaner.cpp | 77 +++++++------ clang-tools-extra/clangd/IncludeCleaner.h | 10 ++ clang-tools-extra/clangd/Preamble.cpp | 6 +- .../clangd/unittests/HoverTests.cpp | 104 +++++++++++++++++- .../include/clang-include-cleaner/Analysis.h | 10 ++ .../include-cleaner/lib/AnalysisInternal.h | 8 +- 8 files changed, 229 insertions(+), 54 deletions(-) diff --git a/clang-tools-extra/clangd/Hover.cpp b/clang-tools-extra/clangd/Hover.cpp index c5436141adbf7..e240c22259f35 100644 --- a/clang-tools-extra/clangd/Hover.cpp +++ b/clang-tools-extra/clangd/Hover.cpp @@ -12,11 +12,16 @@ #include "CodeCompletionStrings.h" #include "Config.h" #include "FindTarget.h" +#include "IncludeCleaner.h" #include "ParsedAST.h" #include "Selection.h" #include "SourceCode.h" +#include "clang-include-cleaner/Analysis.h" +#include "clang-include-cleaner/Types.h" #include "index/SymbolCollector.h" +#include "support/Logger.h" #include "support/Markup.h" +#include "support/Trace.h" #include "clang/AST/ASTContext.h" #include "clang/AST/ASTDiagnostic.h" #include "clang/AST/ASTTypeTraits.h" @@ -43,11 +48,13 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/Error.h" #include "llvm/Support/Format.h" #include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/raw_ostream.h" #include #include +#include namespace clang { namespace clangd { @@ -1084,6 +1091,49 @@ const NamedDecl *pickDeclToUse(llvm::ArrayRef Candidates) { return Candidates.front(); } +void maybeAddSymbolProviders(ParsedAST &AST, HoverInfo &HI, + include_cleaner::Symbol Sym) { + trace::Span Tracer("Hover::maybeAddSymbolProviders"); + + const SourceManager &SM = AST.getSourceManager(); + llvm::SmallVector RankedProviders = + include_cleaner::headersForSymbol(Sym, SM, AST.getPragmaIncludes()); + if (RankedProviders.empty()) + return; + + std::string Result; + include_cleaner::Includes ConvertedIncludes = + convertIncludes(SM, AST.getIncludeStructure().MainFileIncludes); + for (const auto &P : RankedProviders) { + if (P.kind() == include_cleaner::Header::Physical && + P.physical() == SM.getFileEntryForID(SM.getMainFileID())) + // Main file ranked higher than any #include'd file + break; + + // Pick the best-ranked #include'd provider + auto Matches = ConvertedIncludes.match(P); + if (!Matches.empty()) { + Result = Matches[0]->quote(); + break; + } + } + + if (!Result.empty()) { + HI.Provider = std::move(Result); + return; + } + + // Pick the best-ranked non-#include'd provider + const auto 
&H = RankedProviders.front(); + if (H.kind() == include_cleaner::Header::Physical && + H.physical() == SM.getFileEntryForID(SM.getMainFileID())) + // Do not show main file as provider, otherwise we'll show provider info + // on local variables, etc. + return; + + HI.Provider = spellHeader(AST, SM.getFileEntryForID(SM.getMainFileID()), H); +} + } // namespace std::optional getHover(ParsedAST &AST, Position Pos, @@ -1131,6 +1181,12 @@ std::optional getHover(ParsedAST &AST, Position Pos, HighlightRange = Tok.range(SM).toCharRange(SM); if (auto M = locateMacroAt(Tok, AST.getPreprocessor())) { HI = getHoverContents(*M, Tok, AST); + if (auto DefLoc = M->Info->getDefinitionLoc(); DefLoc.isValid()) { + include_cleaner::Macro IncludeCleanerMacro{ + AST.getPreprocessor().getIdentifierInfo(Tok.text(SM)), DefLoc}; + maybeAddSymbolProviders(AST, *HI, + include_cleaner::Symbol{IncludeCleanerMacro}); + } break; } } else if (Tok.kind() == tok::kw_auto || Tok.kind() == tok::kw_decltype) { @@ -1168,6 +1224,7 @@ std::optional getHover(ParsedAST &AST, Position Pos, if (!HI->Value) HI->Value = printExprValue(N, AST.getASTContext()); maybeAddCalleeArgInfo(N, *HI, PP); + maybeAddSymbolProviders(AST, *HI, include_cleaner::Symbol{*DeclToUse}); } else if (const Expr *E = N->ASTNode.get()) { HI = getHoverContents(N, E, AST, PP, Index); } else if (const Attr *A = N->ASTNode.get()) { @@ -1217,6 +1274,14 @@ markup::Document HoverInfo::present() const { assert(!Name.empty() && "hover triggered on a nameless symbol"); Header.appendCode(Name); + if (!Provider.empty()) { + markup::Paragraph &DI = Output.addParagraph(); + DI.appendText("provided by"); + DI.appendSpace(); + DI.appendCode(Provider); + Output.addRuler(); + } + // Put a linebreak after header to increase readability. Output.addRuler(); // Print Types on their own lines to reduce chances of getting line-wrapped by diff --git a/clang-tools-extra/clangd/Hover.h b/clang-tools-extra/clangd/Hover.h index e63ff95b400b3..7ade177f89cc1 100644 --- a/clang-tools-extra/clangd/Hover.h +++ b/clang-tools-extra/clangd/Hover.h @@ -14,6 +14,7 @@ #include "support/Markup.h" #include "clang/Index/IndexSymbol.h" #include +#include namespace clang { namespace clangd { @@ -67,6 +68,8 @@ struct HoverInfo { std::string LocalScope; /// Name of the symbol, does not contain any "::". std::string Name; + /// Header providing the symbol (best match). Contains ""<>. 
+ std::string Provider; std::optional SymRange; index::SymbolKind Kind = index::SymbolKind::Unknown; std::string Documentation; diff --git a/clang-tools-extra/clangd/IncludeCleaner.cpp b/clang-tools-extra/clangd/IncludeCleaner.cpp index ee470bd8b963f..ab7c05eb834c0 100644 --- a/clang-tools-extra/clangd/IncludeCleaner.cpp +++ b/clang-tools-extra/clangd/IncludeCleaner.cpp @@ -136,45 +136,6 @@ static bool mayConsiderUnused(const Inclusion &Inc, ParsedAST &AST, return true; } -include_cleaner::Includes -convertIncludes(const SourceManager &SM, - const llvm::ArrayRef MainFileIncludes) { - include_cleaner::Includes Includes; - for (const Inclusion &Inc : MainFileIncludes) { - include_cleaner::Include TransformedInc; - llvm::StringRef WrittenRef = llvm::StringRef(Inc.Written); - TransformedInc.Spelled = WrittenRef.trim("\"<>"); - TransformedInc.HashLocation = - SM.getComposedLoc(SM.getMainFileID(), Inc.HashOffset); - TransformedInc.Line = Inc.HashLine + 1; - TransformedInc.Angled = WrittenRef.starts_with("<"); - auto FE = SM.getFileManager().getFile(Inc.Resolved); - if (!FE) { - elog("IncludeCleaner: Failed to get an entry for resolved path {0}: {1}", - Inc.Resolved, FE.getError().message()); - continue; - } - TransformedInc.Resolved = *FE; - Includes.add(std::move(TransformedInc)); - } - return Includes; -} - -std::string spellHeader(ParsedAST &AST, const FileEntry *MainFile, - include_cleaner::Header Provider) { - if (Provider.kind() == include_cleaner::Header::Physical) { - if (auto CanonicalPath = - getCanonicalPath(Provider.physical(), AST.getSourceManager())) { - std::string SpelledHeader = - llvm::cantFail(URI::includeSpelling(URI::create(*CanonicalPath))); - if (!SpelledHeader.empty()) - return SpelledHeader; - } - } - return include_cleaner::spellHeader( - Provider, AST.getPreprocessor().getHeaderSearchInfo(), MainFile); -} - std::vector collectMacroReferences(ParsedAST &AST) { const auto &SM = AST.getSourceManager(); @@ -327,6 +288,44 @@ std::vector generateUnusedIncludeDiagnostics( } } // namespace +include_cleaner::Includes +convertIncludes(const SourceManager &SM, + const llvm::ArrayRef Includes) { + include_cleaner::Includes ConvertedIncludes; + for (const Inclusion &Inc : Includes) { + include_cleaner::Include TransformedInc; + llvm::StringRef WrittenRef = llvm::StringRef(Inc.Written); + TransformedInc.Spelled = WrittenRef.trim("\"<>"); + TransformedInc.HashLocation = + SM.getComposedLoc(SM.getMainFileID(), Inc.HashOffset); + TransformedInc.Line = Inc.HashLine + 1; + TransformedInc.Angled = WrittenRef.starts_with("<"); + auto FE = SM.getFileManager().getFile(Inc.Resolved); + if (!FE) { + elog("IncludeCleaner: Failed to get an entry for resolved path {0}: {1}", + Inc.Resolved, FE.getError().message()); + continue; + } + TransformedInc.Resolved = *FE; + ConvertedIncludes.add(std::move(TransformedInc)); + } + return ConvertedIncludes; +} + +std::string spellHeader(ParsedAST &AST, const FileEntry *MainFile, + include_cleaner::Header Provider) { + if (Provider.kind() == include_cleaner::Header::Physical) { + if (auto CanonicalPath = + getCanonicalPath(Provider.physical(), AST.getSourceManager())) { + std::string SpelledHeader = + llvm::cantFail(URI::includeSpelling(URI::create(*CanonicalPath))); + if (!SpelledHeader.empty()) + return SpelledHeader; + } + } + return include_cleaner::spellHeader( + Provider, AST.getPreprocessor().getHeaderSearchInfo(), MainFile); +} std::vector getUnused(ParsedAST &AST, diff --git a/clang-tools-extra/clangd/IncludeCleaner.h 
b/clang-tools-extra/clangd/IncludeCleaner.h index d7edca035c965..1a5f07869d569 100644 --- a/clang-tools-extra/clangd/IncludeCleaner.h +++ b/clang-tools-extra/clangd/IncludeCleaner.h @@ -68,6 +68,16 @@ std::vector issueIncludeCleanerDiagnostics(ParsedAST &AST, /// FIXME: remove this hack once the implementation is good enough. void setIncludeCleanerAnalyzesStdlib(bool B); +/// Converts the clangd include representation to include-cleaner +/// include representation. +include_cleaner::Includes +convertIncludes(const SourceManager &SM, + const llvm::ArrayRef Includes); + +/// Determines the header spelling of an include-cleaner header +/// representation. The spelling contains the ""<> characters. +std::string spellHeader(ParsedAST &AST, const FileEntry *MainFile, + include_cleaner::Header Provider); } // namespace clangd } // namespace clang diff --git a/clang-tools-extra/clangd/Preamble.cpp b/clang-tools-extra/clangd/Preamble.cpp index 061c67d65f7d8..08662697a4a5c 100644 --- a/clang-tools-extra/clangd/Preamble.cpp +++ b/clang-tools-extra/clangd/Preamble.cpp @@ -135,11 +135,7 @@ class CppFilePreambleCallbacks : public PreambleCallbacks { SourceMgr = &CI.getSourceManager(); PP = &CI.getPreprocessor(); Includes.collect(CI); - if (Config::current().Diagnostics.UnusedIncludes == - Config::IncludesPolicy::Strict || - Config::current().Diagnostics.MissingIncludes == - Config::IncludesPolicy::Strict) - Pragmas.record(CI); + Pragmas.record(CI); if (BeforeExecuteCallback) BeforeExecuteCallback(CI); } diff --git a/clang-tools-extra/clangd/unittests/HoverTests.cpp b/clang-tools-extra/clangd/unittests/HoverTests.cpp index 211fd1311c98f..6ee9384204036 100644 --- a/clang-tools-extra/clangd/unittests/HoverTests.cpp +++ b/clang-tools-extra/clangd/unittests/HoverTests.cpp @@ -14,11 +14,12 @@ #include "TestTU.h" #include "index/MemIndex.h" #include "clang/AST/Attr.h" +#include "clang/Format/Format.h" #include "clang/Index/IndexSymbol.h" #include "llvm/ADT/StringRef.h" -#include "gmock/gmock.h" #include "gtest/gtest.h" +#include #include #include @@ -28,6 +29,10 @@ namespace { using PassMode = HoverInfo::PassType::PassMode; +std::string guard(llvm::StringRef Code) { + return "#pragma once\n" + Code.str(); +} + TEST(Hover, Structured) { struct { const char *const Code; @@ -2882,6 +2887,99 @@ TEST(Hover, All) { } } +TEST(Hover, Providers) { + struct { + const char *Code; + const std::function ExpectedBuilder; + } Cases[] = {{R"cpp( + struct Foo {}; + Foo F = Fo^o{}; + )cpp", + [](HoverInfo &HI) { HI.Provider = ""; }}, + {R"cpp( + #include "foo.h" + Foo F = Fo^o{}; + )cpp", + [](HoverInfo &HI) { HI.Provider = "\"foo.h\""; }}, + {R"cpp( + #include "all.h" + Foo F = Fo^o{}; + )cpp", + [](HoverInfo &HI) { HI.Provider = "\"foo.h\""; }}, + {R"cpp( + #define FOO 5 + int F = ^FOO; + )cpp", + [](HoverInfo &HI) { HI.Provider = ""; }}, + {R"cpp( + #include "foo.h" + int F = ^FOO; + )cpp", + [](HoverInfo &HI) { HI.Provider = "\"foo.h\""; }}, + {R"cpp( + #include "all.h" + int F = ^FOO; + )cpp", + [](HoverInfo &HI) { HI.Provider = "\"foo.h\""; }}, + {R"cpp( + #include "foo.h" + Foo A; + Foo B; + Foo C = A ^+ B; + )cpp", + [](HoverInfo &HI) { HI.Provider = "\"foo.h\""; }}, + // Hover selects the underlying decl of the using decl + {R"cpp( + #include "foo.h" + namespace ns { + using ::Foo; + } + ns::F^oo d; + )cpp", + [](HoverInfo &HI) { HI.Provider = "\"foo.h\""; }}}; + + for (const auto &Case : Cases) { + Annotations Code{Case.Code}; + SCOPED_TRACE(Code.code()); + + TestTU TU; + TU.Filename = "foo.cpp"; + TU.Code = 
Code.code(); + TU.AdditionalFiles["foo.h"] = guard(R"cpp( + #define FOO 1 + class Foo {}; + Foo& operator+(const Foo, const Foo); + )cpp"); + TU.AdditionalFiles["all.h"] = guard("#include \"foo.h\""); + + auto AST = TU.build(); + auto H = getHover(AST, Code.point(), format::getLLVMStyle(), nullptr); + ASSERT_TRUE(H); + HoverInfo Expected; + Case.ExpectedBuilder(Expected); + SCOPED_TRACE(H->present().asMarkdown()); + EXPECT_EQ(H->Provider, Expected.Provider); + } +} + +TEST(Hover, ParseProviderInfo) { + HoverInfo HIFoo; + HIFoo.Name = "foo"; + HIFoo.Provider = "\"foo.h\""; + + HoverInfo HIFooBar; + HIFooBar.Name = "foo"; + HIFooBar.Provider = ""; + struct Case { + HoverInfo HI; + llvm::StringRef ExpectedMarkdown; + } Cases[] = {{HIFoo, "### `foo` \nprovided by `\"foo.h\"`"}, + {HIFooBar, "### `foo` \nprovided by ``"}}; + + for (const auto &Case : Cases) + EXPECT_EQ(Case.HI.present().asMarkdown(), Case.ExpectedMarkdown); +} + TEST(Hover, DocsFromIndex) { Annotations T(R"cpp( template class X {}; @@ -3359,8 +3457,8 @@ TEST(Hover, ParseDocumentation) { } } -// This is a separate test as headings don't create any differences in plaintext -// mode. +// This is a separate test as headings don't create any differences in +// plaintext mode. TEST(Hover, PresentHeadings) { HoverInfo HI; HI.Kind = index::SymbolKind::Variable; diff --git a/clang-tools-extra/include-cleaner/include/clang-include-cleaner/Analysis.h b/clang-tools-extra/include-cleaner/include/clang-include-cleaner/Analysis.h index cd11700548075..66916a52046cb 100644 --- a/clang-tools-extra/include-cleaner/include/clang-include-cleaner/Analysis.h +++ b/clang-tools-extra/include-cleaner/include/clang-include-cleaner/Analysis.h @@ -16,11 +16,13 @@ #include "clang/Format/Format.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLFunctionalExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Support/MemoryBufferRef.h" #include namespace clang { class SourceLocation; +class SourceManager; class Decl; class FileEntry; class HeaderSearch; @@ -75,6 +77,14 @@ std::string fixIncludes(const AnalysisResults &Results, llvm::StringRef Code, std::string spellHeader(const Header &H, HeaderSearch &HS, const FileEntry *Main); + +/// Gets all the providers for a symbol by traversing each location. +/// Returned headers are sorted by relevance, first element is the most +/// likely provider for the symbol. +llvm::SmallVector
headersForSymbol(const Symbol &S, + const SourceManager &SM, + const PragmaIncludes *PI); + } // namespace include_cleaner } // namespace clang diff --git a/clang-tools-extra/include-cleaner/lib/AnalysisInternal.h b/clang-tools-extra/include-cleaner/lib/AnalysisInternal.h index acf462919344b..6bfed91b584b3 100644 --- a/clang-tools-extra/include-cleaner/lib/AnalysisInternal.h +++ b/clang-tools-extra/include-cleaner/lib/AnalysisInternal.h @@ -22,6 +22,7 @@ #define CLANG_INCLUDE_CLEANER_ANALYSISINTERNAL_H #include "TypesInternal.h" +#include "clang-include-cleaner/Analysis.h" #include "clang-include-cleaner/Record.h" #include "clang-include-cleaner/Types.h" #include "llvm/ADT/STLFunctionalExtras.h" @@ -58,13 +59,6 @@ llvm::SmallVector> findHeaders(const SymbolLocation &Loc, /// A set of locations that provides the declaration. std::vector> locateSymbol(const Symbol &S); -/// Gets all the providers for a symbol by traversing each location. -/// Returned headers are sorted by relevance, first element is the most -/// likely provider for the symbol. -llvm::SmallVector
headersForSymbol(const Symbol &S, - const SourceManager &SM, - const PragmaIncludes *PI); - /// Write an HTML summary of the analysis to the given stream. void writeHTMLReport(FileID File, const Includes &, llvm::ArrayRef Roots, From 9c8bdbcbc502fac7d7d8da5c848cec448daf26ae Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 23 Mar 2023 09:05:34 -0500 Subject: [PATCH 144/208] [libc] Implement memory fences on NVPTX Memory fences are not handled by the NVPTX backend. We need to replace them with a memory barrier intrinsic function. This doesn't include the ordering, but should perform the necessary functionality, albeit slower. Reviewed By: tianshilei1992 Differential Revision: https://reviews.llvm.org/D146725 --- libc/src/__support/CPP/atomic.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/libc/src/__support/CPP/atomic.h b/libc/src/__support/CPP/atomic.h index b0e90e32dadd2..5514062525cce 100644 --- a/libc/src/__support/CPP/atomic.h +++ b/libc/src/__support/CPP/atomic.h @@ -10,6 +10,7 @@ #define LLVM_LIBC_SRC_SUPPORT_CPP_ATOMIC_H #include "src/__support/macros/attributes.h" +#include "src/__support/macros/properties/architectures.h" #include "type_traits.h" @@ -96,7 +97,14 @@ template struct Atomic { // Issue a thread fence with the given memory ordering. LIBC_INLINE void atomic_thread_fence(MemoryOrder mem_ord) { +// The NVPTX backend currently does not support atomic thread fences so we use a +// full system fence instead. +#ifdef LIBC_TARGET_ARCH_IS_NVPTX + (void)mem_ord; + __nvvm_membar_sys(); +#else __atomic_thread_fence(int(mem_ord)); +#endif } } // namespace cpp From 5525087e4c7bdedea3935fd08ee99ac3ba53a5b2 Mon Sep 17 00:00:00 2001 From: Archibald Elliott Date: Thu, 23 Mar 2023 15:56:07 +0000 Subject: [PATCH 145/208] [NFC][AArch64] Sort Hints in armv8.3a-signed-pointer.s test --- .../test/MC/AArch64/armv8.3a-signed-pointer.s | 108 +++++++++--------- 1 file changed, 54 insertions(+), 54 deletions(-) diff --git a/llvm/test/MC/AArch64/armv8.3a-signed-pointer.s b/llvm/test/MC/AArch64/armv8.3a-signed-pointer.s index e13b1bf3c98d8..e95c9309a3d4d 100644 --- a/llvm/test/MC/AArch64/armv8.3a-signed-pointer.s +++ b/llvm/test/MC/AArch64/armv8.3a-signed-pointer.s @@ -96,84 +96,84 @@ // ALL-EMPTY: // ALL-EMPTY: - hint #25 - paciasp -// CHECK-NEXT: paciasp // encoding: [0x3f,0x23,0x03,0xd5] -// CHECK-NEXT: paciasp // encoding: [0x3f,0x23,0x03,0xd5] -// NO83-NEXT: hint #25 // encoding: [0x3f,0x23,0x03,0xd5] -// NO83-NEXT: hint #25 // encoding: [0x3f,0x23,0x03,0xd5] - hint #29 - autiasp -// CHECK-NEXT: autiasp // encoding: [0xbf,0x23,0x03,0xd5] -// CHECK-NEXT: autiasp // encoding: [0xbf,0x23,0x03,0xd5] -// NO83-NEXT: hint #29 // encoding: [0xbf,0x23,0x03,0xd5] -// NO83-NEXT: hint #29 // encoding: [0xbf,0x23,0x03,0xd5] - hint #24 - paciaz -// CHECK-NEXT: paciaz // encoding: [0x1f,0x23,0x03,0xd5] -// CHECK-NEXT: paciaz // encoding: [0x1f,0x23,0x03,0xd5] -// NO83-NEXT: hint #24 // encoding: [0x1f,0x23,0x03,0xd5] -// NO83-NEXT: hint #24 // encoding: [0x1f,0x23,0x03,0xd5] - hint #28 - autiaz -// CHECK-NEXT: autiaz // encoding: [0x9f,0x23,0x03,0xd5] -// CHECK-NEXT: autiaz // encoding: [0x9f,0x23,0x03,0xd5] -// NO83-NEXT: hint #28 // encoding: [0x9f,0x23,0x03,0xd5] -// NO83-NEXT: hint #28 // encoding: [0x9f,0x23,0x03,0xd5] + hint #7 + xpaclri +// CHECK-NEXT: xpaclri // encoding: [0xff,0x20,0x03,0xd5] +// CHECK-NEXT: xpaclri // encoding: [0xff,0x20,0x03,0xd5] +// NO83-NEXT: hint #7 // encoding: [0xff,0x20,0x03,0xd5] +// NO83-NEXT: hint #7 // encoding: [0xff,0x20,0x03,0xd5] hint #8 pacia1716 
// CHECK-NEXT: pacia1716 // encoding: [0x1f,0x21,0x03,0xd5] // CHECK-NEXT: pacia1716 // encoding: [0x1f,0x21,0x03,0xd5] // NO83-NEXT: hint #8 // encoding: [0x1f,0x21,0x03,0xd5] // NO83-NEXT: hint #8 // encoding: [0x1f,0x21,0x03,0xd5] + hint #10 + pacib1716 +// CHECK-NEXT: pacib1716 // encoding: [0x5f,0x21,0x03,0xd5] +// CHECK-NEXT: pacib1716 // encoding: [0x5f,0x21,0x03,0xd5] +// NO83-NEXT: hint #10 // encoding: [0x5f,0x21,0x03,0xd5] +// NO83-NEXT: hint #10 // encoding: [0x5f,0x21,0x03,0xd5] hint #12 autia1716 // CHECK-NEXT: autia1716 // encoding: [0x9f,0x21,0x03,0xd5] // CHECK-NEXT: autia1716 // encoding: [0x9f,0x21,0x03,0xd5] // NO83-NEXT: hint #12 // encoding: [0x9f,0x21,0x03,0xd5] // NO83-NEXT: hint #12 // encoding: [0x9f,0x21,0x03,0xd5] - hint #27 - pacibsp -// CHECK-NEXT: pacibsp // encoding: [0x7f,0x23,0x03,0xd5] -// CHECK-NEXT: pacibsp // encoding: [0x7f,0x23,0x03,0xd5] -// NO83-NEXT: hint #27 // encoding: [0x7f,0x23,0x03,0xd5] -// NO83-NEXT: hint #27 // encoding: [0x7f,0x23,0x03,0xd5] - hint #31 - autibsp -// CHECK-NEXT: autibsp // encoding: [0xff,0x23,0x03,0xd5] -// CHECK-NEXT: autibsp // encoding: [0xff,0x23,0x03,0xd5] -// NO83-NEXT: hint #31 // encoding: [0xff,0x23,0x03,0xd5] -// NO83-NEXT: hint #31 // encoding: [0xff,0x23,0x03,0xd5] + hint #14 + autib1716 +// CHECK-NEXT: autib1716 // encoding: [0xdf,0x21,0x03,0xd5] +// CHECK-NEXT: autib1716 // encoding: [0xdf,0x21,0x03,0xd5] +// NO83-NEXT: hint #14 // encoding: [0xdf,0x21,0x03,0xd5] +// NO83-NEXT: hint #14 // encoding: [0xdf,0x21,0x03,0xd5] + hint #24 + paciaz +// CHECK-NEXT: paciaz // encoding: [0x1f,0x23,0x03,0xd5] +// CHECK-NEXT: paciaz // encoding: [0x1f,0x23,0x03,0xd5] +// NO83-NEXT: hint #24 // encoding: [0x1f,0x23,0x03,0xd5] +// NO83-NEXT: hint #24 // encoding: [0x1f,0x23,0x03,0xd5] + hint #25 + paciasp +// CHECK-NEXT: paciasp // encoding: [0x3f,0x23,0x03,0xd5] +// CHECK-NEXT: paciasp // encoding: [0x3f,0x23,0x03,0xd5] +// NO83-NEXT: hint #25 // encoding: [0x3f,0x23,0x03,0xd5] +// NO83-NEXT: hint #25 // encoding: [0x3f,0x23,0x03,0xd5] hint #26 pacibz // CHECK-NEXT: pacibz // encoding: [0x5f,0x23,0x03,0xd5] // CHECK-NEXT: pacibz // encoding: [0x5f,0x23,0x03,0xd5] // NO83-NEXT: hint #26 // encoding: [0x5f,0x23,0x03,0xd5] // NO83-NEXT: hint #26 // encoding: [0x5f,0x23,0x03,0xd5] + hint #27 + pacibsp +// CHECK-NEXT: pacibsp // encoding: [0x7f,0x23,0x03,0xd5] +// CHECK-NEXT: pacibsp // encoding: [0x7f,0x23,0x03,0xd5] +// NO83-NEXT: hint #27 // encoding: [0x7f,0x23,0x03,0xd5] +// NO83-NEXT: hint #27 // encoding: [0x7f,0x23,0x03,0xd5] + hint #28 + autiaz +// CHECK-NEXT: autiaz // encoding: [0x9f,0x23,0x03,0xd5] +// CHECK-NEXT: autiaz // encoding: [0x9f,0x23,0x03,0xd5] +// NO83-NEXT: hint #28 // encoding: [0x9f,0x23,0x03,0xd5] +// NO83-NEXT: hint #28 // encoding: [0x9f,0x23,0x03,0xd5] + hint #29 + autiasp +// CHECK-NEXT: autiasp // encoding: [0xbf,0x23,0x03,0xd5] +// CHECK-NEXT: autiasp // encoding: [0xbf,0x23,0x03,0xd5] +// NO83-NEXT: hint #29 // encoding: [0xbf,0x23,0x03,0xd5] +// NO83-NEXT: hint #29 // encoding: [0xbf,0x23,0x03,0xd5] hint #30 autibz // CHECK-NEXT: autibz // encoding: [0xdf,0x23,0x03,0xd5] // CHECK-NEXT: autibz // encoding: [0xdf,0x23,0x03,0xd5] // NO83-NEXT: hint #30 // encoding: [0xdf,0x23,0x03,0xd5] // NO83-NEXT: hint #30 // encoding: [0xdf,0x23,0x03,0xd5] - hint #10 - pacib1716 -// CHECK-NEXT: pacib1716 // encoding: [0x5f,0x21,0x03,0xd5] -// CHECK-NEXT: pacib1716 // encoding: [0x5f,0x21,0x03,0xd5] -// NO83-NEXT: hint #10 // encoding: [0x5f,0x21,0x03,0xd5] -// NO83-NEXT: hint #10 // encoding: 
[0x5f,0x21,0x03,0xd5] - hint #14 - autib1716 -// CHECK-NEXT: autib1716 // encoding: [0xdf,0x21,0x03,0xd5] -// CHECK-NEXT: autib1716 // encoding: [0xdf,0x21,0x03,0xd5] -// NO83-NEXT: hint #14 // encoding: [0xdf,0x21,0x03,0xd5] -// NO83-NEXT: hint #14 // encoding: [0xdf,0x21,0x03,0xd5] - hint #7 - xpaclri -// CHECK-NEXT: xpaclri // encoding: [0xff,0x20,0x03,0xd5] -// CHECK-NEXT: xpaclri // encoding: [0xff,0x20,0x03,0xd5] -// NO83-NEXT: hint #7 // encoding: [0xff,0x20,0x03,0xd5] -// NO83-NEXT: hint #7 // encoding: [0xff,0x20,0x03,0xd5] + hint #31 + autibsp +// CHECK-NEXT: autibsp // encoding: [0xff,0x23,0x03,0xd5] +// CHECK-NEXT: autibsp // encoding: [0xff,0x23,0x03,0xd5] +// NO83-NEXT: hint #31 // encoding: [0xff,0x23,0x03,0xd5] +// NO83-NEXT: hint #31 // encoding: [0xff,0x23,0x03,0xd5] // ALL-EMPTY: pacia x0, x1 From e33f8ac9d8b3bd8b376d2306c3988381309b68eb Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 23 Mar 2023 11:27:20 -0500 Subject: [PATCH 146/208] [libc] Fix inline assembly for nvptx quick_exit Summary: The `exit` function in NVPTX has no intrinsic, but the assembly requires a semicolon in the ptx, otherwise it will fail. --- libc/src/__support/OSUtil/gpu/quick_exit.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libc/src/__support/OSUtil/gpu/quick_exit.cpp b/libc/src/__support/OSUtil/gpu/quick_exit.cpp index 56f0427c8d81b..3fab438a357a5 100644 --- a/libc/src/__support/OSUtil/gpu/quick_exit.cpp +++ b/libc/src/__support/OSUtil/gpu/quick_exit.cpp @@ -27,7 +27,7 @@ void quick_exit(int status) { [](rpc::Buffer *) { /* void */ }); #if defined(LIBC_TARGET_ARCH_IS_NVPTX) - asm("exit" ::: "memory"); + asm("exit;" ::: "memory"); #elif defined(LIBC_TARGET_ARCH_IS_AMDGPU) // This will terminate the entire wavefront, may not be valid with divergent // work items. From 2f5fdbfab8c63047bd4ebef154258868065168b3 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 23 Mar 2023 17:32:23 +0100 Subject: [PATCH 147/208] [MergeFunc] Don't assume constant metadata operands We should not call mdconst::extract, unless we know that the metadata in question is ConstantAsMetadata. For now we consider all other metadata as equal. The noalias test shows that this is not correct, but at least it doesn't crash anymore. 
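For illustration, the heart of the change in FunctionComparator::cmpMetadata is roughly the following (a minimal sketch mirroring the hunk below, with the template arguments spelled out; cmpConstants is the existing three-way constant comparison in FunctionComparator, and the assumed context is llvm/lib/Transforms/Utils/FunctionComparator.cpp):

  // Compare two metadata operands. Only coerce operands that really are
  // ConstantAsMetadata; mdconst::extract would assert on anything else,
  // e.g. the MDNode operands of !noalias / !alias.scope. All remaining
  // metadata kinds are conservatively treated as equal for now.
  int FunctionComparator::cmpMetadata(const Metadata *L,
                                      const Metadata *R) const {
    auto *CL = dyn_cast<ConstantAsMetadata>(L);
    auto *CR = dyn_cast<ConstantAsMetadata>(R);
    if (CL == CR)
      return 0;
    if (!CL)
      return -1; // Non-constant metadata orders before constants.
    if (!CR)
      return 1;
    return cmpConstants(CL->getValue(), CR->getValue());
  }

MDNode comparison (cmpMDNode) then walks the operands and calls this per operand, so structurally different non-constant operands still compare equal, which is why the noalias case noted above is merged.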
--- .../Transforms/Utils/FunctionComparator.h | 3 +- .../Transforms/Utils/FunctionComparator.cpp | 44 ++++++++++--------- .../MergeFunc/mergefunc-preserve-nonnull.ll | 38 +++++++++++++--- 3 files changed, 59 insertions(+), 26 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/FunctionComparator.h b/llvm/include/llvm/Transforms/Utils/FunctionComparator.h index 400b9faa94c1b..78761fc78fee8 100644 --- a/llvm/include/llvm/Transforms/Utils/FunctionComparator.h +++ b/llvm/include/llvm/Transforms/Utils/FunctionComparator.h @@ -332,7 +332,8 @@ class FunctionComparator { int cmpOrderings(AtomicOrdering L, AtomicOrdering R) const; int cmpInlineAsm(const InlineAsm *L, const InlineAsm *R) const; int cmpAttrs(const AttributeList L, const AttributeList R) const; - int cmpMetadata(const MDNode *L, const MDNode *R) const; + int cmpMDNode(const MDNode *L, const MDNode *R) const; + int cmpMetadata(const Metadata *L, const Metadata *R) const; int cmpInstMetadata(Instruction const *L, Instruction const *R) const; int cmpOperandBundlesSchema(const CallBase &LCS, const CallBase &RCS) const; diff --git a/llvm/lib/Transforms/Utils/FunctionComparator.cpp b/llvm/lib/Transforms/Utils/FunctionComparator.cpp index af8bc8126160e..7fb6a7415a6fe 100644 --- a/llvm/lib/Transforms/Utils/FunctionComparator.cpp +++ b/llvm/lib/Transforms/Utils/FunctionComparator.cpp @@ -157,7 +157,25 @@ int FunctionComparator::cmpAttrs(const AttributeList L, return 0; } -int FunctionComparator::cmpMetadata(const MDNode *L, const MDNode *R) const { +int FunctionComparator::cmpMetadata(const Metadata *L, + const Metadata *R) const { + // TODO: the following routine coerce the metadata contents into constants + // before comparison. + // It ignores any other cases, so that the metadata nodes are considered + // equal even though this is not correct. + // We should structurally compare the metadata nodes to be perfect here. + auto *CL = dyn_cast(L); + auto *CR = dyn_cast(R); + if (CL == CR) + return 0; + if (!CL) + return -1; + if (!CR) + return 1; + return cmpConstants(CL->getValue(), CR->getValue()); +} + +int FunctionComparator::cmpMDNode(const MDNode *L, const MDNode *R) const { if (L == R) return 0; if (!L) @@ -172,23 +190,9 @@ int FunctionComparator::cmpMetadata(const MDNode *L, const MDNode *R) const { // function semantically. if (int Res = cmpNumbers(L->getNumOperands(), R->getNumOperands())) return Res; - for (size_t I = 0; I < L->getNumOperands(); ++I) { - // TODO: the following routine coerce the metadata contents into numbers - // before comparison. - // It ignores any other cases, so that the metadata nodes are considered - // equal even though this is not correct. - // We should structurally compare the metadata nodes to be perfect here. 
- ConstantInt *LLow = mdconst::extract(L->getOperand(I)); - ConstantInt *RLow = mdconst::extract(R->getOperand(I)); - if (LLow == RLow) - continue; - if (!LLow) - return -1; - if (!RLow) - return 1; - if (int Res = cmpAPInts(LLow->getValue(), RLow->getValue())) + for (size_t I = 0; I < L->getNumOperands(); ++I) + if (int Res = cmpMetadata(L->getOperand(I), R->getOperand(I))) return Res; - } return 0; } @@ -209,7 +213,7 @@ int FunctionComparator::cmpInstMetadata(Instruction const *L, auto const [KeyR, MR] = MDR[I]; if (int Res = cmpNumbers(KeyL, KeyR)) return Res; - if (int Res = cmpMetadata(ML, MR)) + if (int Res = cmpMDNode(ML, MR)) return Res; } return 0; @@ -645,8 +649,8 @@ int FunctionComparator::cmpOperations(const Instruction *L, if (int Res = cmpNumbers(CI->getTailCallKind(), cast(R)->getTailCallKind())) return Res; - return cmpMetadata(L->getMetadata(LLVMContext::MD_range), - R->getMetadata(LLVMContext::MD_range)); + return cmpMDNode(L->getMetadata(LLVMContext::MD_range), + R->getMetadata(LLVMContext::MD_range)); } if (const InsertValueInst *IVI = dyn_cast(L)) { ArrayRef LIndices = IVI->getIndices(); diff --git a/llvm/test/Transforms/MergeFunc/mergefunc-preserve-nonnull.ll b/llvm/test/Transforms/MergeFunc/mergefunc-preserve-nonnull.ll index 12bb0e8b38425..3481d53b626fc 100644 --- a/llvm/test/Transforms/MergeFunc/mergefunc-preserve-nonnull.ll +++ b/llvm/test/Transforms/MergeFunc/mergefunc-preserve-nonnull.ll @@ -28,8 +28,8 @@ define void @f2(ptr %0, ptr %1) { ret void } -define void @f3(ptr %0, ptr %1) { -; CHECK-LABEL: @f3( +define void @noundef(ptr %0, ptr %1) { +; CHECK-LABEL: @noundef( ; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP1:%.*]], align 8, !noundef !0 ; CHECK-NEXT: store ptr [[TMP3]], ptr [[TMP0:%.*]], align 8 ; CHECK-NEXT: ret void @@ -39,9 +39,20 @@ define void @f3(ptr %0, ptr %1) { ret void } -define void @f4(ptr %0, ptr %1) { -; CHECK-LABEL: @f4( -; CHECK-NEXT: tail call void @f3(ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) +define void @noalias_1(ptr %0, ptr %1) { +; CHECK-LABEL: @noalias_1( +; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP1:%.*]], align 8, !noalias !1 +; CHECK-NEXT: store ptr [[TMP3]], ptr [[TMP0:%.*]], align 8, !alias.scope !1 +; CHECK-NEXT: ret void +; + %3 = load ptr, ptr %1, align 8, !noalias !4 + store ptr %3, ptr %0, align 8, !alias.scope !4 + ret void +} + +define void @noundef_dbg(ptr %0, ptr %1) { +; CHECK-LABEL: @noundef_dbg( +; CHECK-NEXT: tail call void @noundef(ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) ; CHECK-NEXT: ret void ; %3 = load ptr, ptr %1, align 8, !noundef !0, !dbg !1 @@ -49,5 +60,22 @@ define void @f4(ptr %0, ptr %1) { ret void } +; FIXME: This is merged despite different noalias metadata. 
+define void @noalias_2(ptr %0, ptr %1) { +; CHECK-LABEL: @noalias_2( +; CHECK-NEXT: tail call void @noalias_1(ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) +; CHECK-NEXT: ret void +; + %3 = load ptr, ptr %1, align 8, !noalias !7 + store ptr %3, ptr %0, align 8, !alias.scope !7 + ret void +} + !0 = !{} !1 = !{} +!2 = !{!2} +!3 = !{!3, !2} +!4 = !{!3} +!5 = !{!5} +!6 = !{!6, !5} +!7 = !{!6} From 4c5dee7773dcc5ec1b7485dfba5ce5baa2355b2f Mon Sep 17 00:00:00 2001 From: Renaud-K Date: Tue, 21 Mar 2023 16:32:26 -0700 Subject: [PATCH 148/208] [flang] Lowering fir.dispatch in the polymorphic op pass Differential revision: https://reviews.llvm.org/D146594 --- .../flang/Semantics/runtime-type-info.h | 8 + flang/lib/Optimizer/CodeGen/CodeGen.cpp | 152 +------------- .../Transforms/PolymorphicOpConversion.cpp | 155 ++++++++++++++ flang/lib/Semantics/runtime-type-info.cpp | 9 +- flang/test/Fir/dispatch.f90 | 192 ++++++++++-------- flang/test/Lower/allocatable-polymorphic.f90 | 8 +- 6 files changed, 287 insertions(+), 237 deletions(-) diff --git a/flang/include/flang/Semantics/runtime-type-info.h b/flang/include/flang/Semantics/runtime-type-info.h index 76560b98b1c20..e27091cf32de0 100644 --- a/flang/include/flang/Semantics/runtime-type-info.h +++ b/flang/include/flang/Semantics/runtime-type-info.h @@ -42,6 +42,14 @@ RuntimeDerivedTypeTables BuildRuntimeDerivedTypeTables(SemanticsContext &); /// to describe other derived types at runtime in flang descriptor. constexpr char typeInfoBuiltinModule[]{"__fortran_type_info"}; +/// Name of the bindings descriptor component in the DerivedType type of the +/// __Fortran_type_info module +constexpr char bindingDescCompName[]{"binding"}; + +/// Name of the __builtin_c_funptr component in the Binding type of the +/// __Fortran_type_info module +constexpr char procCompName[]{"proc"}; + SymbolVector CollectBindings(const Scope &dtScope); } // namespace Fortran::semantics diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index 6b4591789c545..8ea8fa7290372 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -104,10 +104,8 @@ template class FIROpConversion : public mlir::ConvertOpToLLVMPattern { public: explicit FIROpConversion(fir::LLVMTypeConverter &lowering, - const fir::FIRToLLVMPassOptions &options, - const fir::BindingTables &bindingTables) - : mlir::ConvertOpToLLVMPattern(lowering), options(options), - bindingTables(bindingTables) {} + const fir::FIRToLLVMPassOptions &options) + : mlir::ConvertOpToLLVMPattern(lowering), options(options) {} protected: mlir::Type convertType(mlir::Type ty) const { @@ -358,7 +356,6 @@ class FIROpConversion : public mlir::ConvertOpToLLVMPattern { } const fir::FIRToLLVMPassOptions &options; - const fir::BindingTables &bindingTables; }; /// FIR conversion pattern template @@ -970,131 +967,6 @@ struct ConvertOpConversion : public FIROpConversion { } }; -/// Lower `fir.dispatch` operation. A virtual call to a method in a dispatch -/// table. -struct DispatchOpConversion : public FIROpConversion { - using FIROpConversion::FIROpConversion; - - mlir::LogicalResult - matchAndRewrite(fir::DispatchOp dispatch, OpAdaptor adaptor, - mlir::ConversionPatternRewriter &rewriter) const override { - mlir::Location loc = dispatch.getLoc(); - - if (bindingTables.empty()) - return emitError(loc) << "no binding tables found"; - - // Get derived type information. 
- mlir::Type declaredType = - fir::getDerivedType(dispatch.getObject().getType().getEleTy()); - assert(declaredType.isa() && "expecting fir.type"); - auto recordType = declaredType.dyn_cast(); - - // Lookup for the binding table. - auto bindingsIter = bindingTables.find(recordType.getName()); - if (bindingsIter == bindingTables.end()) - return emitError(loc) - << "cannot find binding table for " << recordType.getName(); - - // Lookup for the binding. - const fir::BindingTable &bindingTable = bindingsIter->second; - auto bindingIter = bindingTable.find(dispatch.getMethod()); - if (bindingIter == bindingTable.end()) - return emitError(loc) - << "cannot find binding for " << dispatch.getMethod(); - unsigned bindingIdx = bindingIter->second; - - mlir::Value passedObject = dispatch.getObject(); - - auto module = dispatch.getOperation()->getParentOfType(); - mlir::Type typeDescTy; - std::string typeDescName = - fir::NameUniquer::getTypeDescriptorName(recordType.getName()); - if (auto global = module.lookupSymbol(typeDescName)) { - typeDescTy = convertType(global.getType()); - } else if (auto global = - module.lookupSymbol(typeDescName)) { - // The global may have already been translated to LLVM. - typeDescTy = global.getType(); - } - - unsigned typeDescFieldId = getTypeDescFieldId(passedObject.getType()); - - auto descPtr = adaptor.getOperands()[0] - .getType() - .dyn_cast(); - - // TODO: the following loads from the type descriptor related - // data structures must have proper TBAA access tags. - // These loads cannot alias with any real data accesses nor - // with any box accesses. Moreover, they can probably be marked - // as reading from constant memory (fourth operand of a TBAA - // tag may be set to true). These accesses probably deserve - // separate sub-root in the TBAA graph. - - // Load the descriptor. - auto desc = rewriter.create( - loc, descPtr.getElementType(), adaptor.getOperands()[0]); - - // Load the type descriptor. - auto typeDescPtr = - rewriter.create(loc, desc, typeDescFieldId); - auto typeDesc = - rewriter.create(loc, typeDescTy, typeDescPtr); - - // Load the bindings descriptor. - auto typeDescStructTy = typeDescTy.dyn_cast(); - auto bindingDescType = - typeDescStructTy.getBody()[0].dyn_cast(); - auto bindingDesc = - rewriter.create(loc, typeDesc, 0); - - // Load the correct binding. - auto bindingType = - bindingDescType.getBody()[0].dyn_cast(); - auto baseBindingPtr = rewriter.create( - loc, bindingDesc, kAddrPosInBox); - auto bindingPtr = rewriter.create( - loc, bindingType, baseBindingPtr, - llvm::ArrayRef{static_cast(bindingIdx)}); - auto binding = rewriter.create( - loc, bindingType.getElementType(), bindingPtr); - - // Get the function type. - llvm::SmallVector argTypes; - for (mlir::Value operand : adaptor.getOperands().drop_front()) - argTypes.push_back(operand.getType()); - mlir::Type resultType; - if (dispatch.getResults().empty()) - resultType = mlir::LLVM::LLVMVoidType::get(dispatch.getContext()); - else - resultType = convertType(dispatch.getResults()[0].getType()); - auto fctType = mlir::LLVM::LLVMFunctionType::get(resultType, argTypes, - /*isVarArg=*/false); - - // Get the function pointer. - auto builtinFuncPtr = - rewriter.create(loc, binding, 0); - auto funcAddr = - rewriter.create(loc, builtinFuncPtr, 0); - auto funcPtr = rewriter.create( - loc, mlir::LLVM::LLVMPointerType::get(fctType), funcAddr); - - // Indirect calls are done with the function pointer as the first operand. 
- llvm::SmallVector args; - args.push_back(funcPtr); - for (mlir::Value operand : adaptor.getOperands().drop_front()) - args.push_back(operand); - auto callOp = rewriter.replaceOpWithNewOp( - dispatch, - dispatch.getResults().empty() ? mlir::TypeRange{} - : fctType.getReturnType(), - "", args); - callOp.removeCalleeAttr(); // Indirect calls do not have callee attr. - - return mlir::success(); - } -}; - /// `fir.disptach_table` operation has no specific CodeGen. The operation is /// only used to carry information during FIR to FIR passes. struct DispatchTableOpConversion @@ -3656,9 +3528,8 @@ struct NegcOpConversion : public FIROpConversion { template struct MustBeDeadConversion : public FIROpConversion { explicit MustBeDeadConversion(fir::LLVMTypeConverter &lowering, - const fir::FIRToLLVMPassOptions &options, - const fir::BindingTables &bindingTables) - : FIROpConversion(lowering, options, bindingTables) {} + const fir::FIRToLLVMPassOptions &options) + : FIROpConversion(lowering, options) {} using OpAdaptor = typename FromOp::Adaptor; mlir::LogicalResult @@ -3768,9 +3639,6 @@ class FIRToLLVMLowering if (mlir::failed(runPipeline(mathConvertionPM, mod))) return signalPassFailure(); - fir::BindingTables bindingTables; - fir::buildBindingTables(bindingTables, mod); - auto *context = getModule().getContext(); fir::LLVMTypeConverter typeConverter{getModule(), options.applyTBAA || applyTBAA}; @@ -3783,11 +3651,11 @@ class FIRToLLVMLowering BoxProcHostOpConversion, BoxRankOpConversion, BoxTypeCodeOpConversion, BoxTypeDescOpConversion, CallOpConversion, CmpcOpConversion, ConstcOpConversion, ConvertOpConversion, CoordinateOpConversion, - DispatchOpConversion, DispatchTableOpConversion, DTEntryOpConversion, - DivcOpConversion, EmboxOpConversion, EmboxCharOpConversion, - EmboxProcOpConversion, ExtractValueOpConversion, FieldIndexOpConversion, - FirEndOpConversion, FreeMemOpConversion, GlobalLenOpConversion, - GlobalOpConversion, HasValueOpConversion, InsertOnRangeOpConversion, + DispatchTableOpConversion, DTEntryOpConversion, DivcOpConversion, + EmboxOpConversion, EmboxCharOpConversion, EmboxProcOpConversion, + ExtractValueOpConversion, FieldIndexOpConversion, FirEndOpConversion, + FreeMemOpConversion, GlobalLenOpConversion, GlobalOpConversion, + HasValueOpConversion, InsertOnRangeOpConversion, InsertValueOpConversion, IsPresentOpConversion, LenParamIndexOpConversion, LoadOpConversion, MulcOpConversion, NegcOpConversion, NoReassocOpConversion, SelectCaseOpConversion, @@ -3797,7 +3665,7 @@ class FIRToLLVMLowering SubcOpConversion, TypeDescOpConversion, UnboxCharOpConversion, UnboxProcOpConversion, UndefOpConversion, UnreachableOpConversion, XArrayCoorOpConversion, XEmboxOpConversion, XReboxOpConversion, - ZeroOpConversion>(typeConverter, options, bindingTables); + ZeroOpConversion>(typeConverter, options); mlir::populateFuncToLLVMConversionPatterns(typeConverter, pattern); mlir::populateOpenMPToLLVMConversionPatterns(typeConverter, pattern); mlir::arith::populateArithToLLVMConversionPatterns(typeConverter, pattern); diff --git a/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp b/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp index f7ee2c19d45ac..2f8cdf7934436 100644 --- a/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp @@ -6,18 +6,25 @@ // //===----------------------------------------------------------------------===// +#include "flang/Lower/BuiltinModules.h" +#include "flang/Optimizer/Builder/Todo.h" 
#include "flang/Optimizer/Dialect/FIRDialect.h" #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/Dialect/FIROpsSupport.h" +#include "flang/Optimizer/Dialect/FIRType.h" #include "flang/Optimizer/Dialect/Support/FIRContext.h" #include "flang/Optimizer/Dialect/Support/KindMapping.h" #include "flang/Optimizer/Support/InternalNames.h" #include "flang/Optimizer/Support/TypeCode.h" +#include "flang/Optimizer/Support/Utils.h" #include "flang/Optimizer/Transforms/Passes.h" #include "flang/Runtime/derived-api.h" +#include "flang/Semantics/runtime-type-info.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/BuiltinOps.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" #include "llvm/ADT/SmallSet.h" @@ -72,6 +79,147 @@ class SelectTypeConv : public OpConversionPattern { std::mutex *moduleMutex; }; +/// Lower `fir.dispatch` operation. A virtual call to a method in a dispatch +/// table. +struct DispatchOpConv : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + DispatchOpConv(mlir::MLIRContext *ctx, const BindingTables &bindingTables) + : mlir::OpConversionPattern(ctx), + bindingTables(bindingTables) {} + + mlir::LogicalResult + matchAndRewrite(fir::DispatchOp dispatch, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + mlir::Location loc = dispatch.getLoc(); + + if (bindingTables.empty()) + return emitError(loc) << "no binding tables found"; + + // Get derived type information. + mlir::Type declaredType = + fir::getDerivedType(dispatch.getObject().getType().getEleTy()); + assert(declaredType.isa() && "expecting fir.type"); + auto recordType = declaredType.dyn_cast(); + + // Lookup for the binding table. + auto bindingsIter = bindingTables.find(recordType.getName()); + if (bindingsIter == bindingTables.end()) + return emitError(loc) + << "cannot find binding table for " << recordType.getName(); + + // Lookup for the binding. 
+ const BindingTable &bindingTable = bindingsIter->second; + auto bindingIter = bindingTable.find(dispatch.getMethod()); + if (bindingIter == bindingTable.end()) + return emitError(loc) + << "cannot find binding for " << dispatch.getMethod(); + unsigned bindingIdx = bindingIter->second; + + mlir::Value passedObject = dispatch.getObject(); + + auto module = dispatch.getOperation()->getParentOfType(); + Type typeDescTy; + std::string typeDescName = + NameUniquer::getTypeDescriptorName(recordType.getName()); + if (auto global = module.lookupSymbol(typeDescName)) { + typeDescTy = global.getType(); + } + + // clang-format off + // Before: + // fir.dispatch "proc1"(%11 : + // !fir.class>>) + + // After: + // %12 = fir.box_tdesc %11 : (!fir.class>>) -> !fir.tdesc + // %13 = fir.convert %12 : (!fir.tdesc) -> !fir.ref> + // %14 = fir.field_index binding, !fir.type<_QM__fortran_type_infoTderivedtype> + // %15 = fir.coordinate_of %13, %14 : (!fir.ref>, !fir.field) -> !fir.ref>>>> + // %bindings = fir.load %15 : !fir.ref>>>> + // %16 = fir.box_addr %bindings : (!fir.box>>>) -> !fir.ptr>> + // %17 = fir.coordinate_of %16, %c0 : (!fir.ptr>>, index) -> !fir.ref> + // %18 = fir.field_index proc, !fir.type<_QM__fortran_type_infoTbinding> + // %19 = fir.coordinate_of %17, %18 : (!fir.ref>, !fir.field) -> !fir.ref> + // %20 = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_funptr> + // %21 = fir.coordinate_of %19, %20 : (!fir.ref>, !fir.field) -> !fir.ref + // %22 = fir.load %21 : !fir.ref + // %23 = fir.convert %22 : (i64) -> (() -> ()) + // fir.call %23() : () -> () + // clang-format on + + // Load the descriptor. + mlir::Type fieldTy = fir::FieldType::get(rewriter.getContext()); + mlir::Type tdescType = + fir::TypeDescType::get(mlir::NoneType::get(rewriter.getContext())); + mlir::Value boxDesc = + rewriter.create(loc, tdescType, passedObject); + boxDesc = rewriter.create( + loc, fir::ReferenceType::get(typeDescTy), boxDesc); + + // Load the bindings descriptor. + auto bindingsCompName = Fortran::semantics::bindingDescCompName; + fir::RecordType typeDescRecTy = typeDescTy.cast(); + mlir::Value field = rewriter.create( + loc, fieldTy, bindingsCompName, typeDescRecTy, mlir::ValueRange{}); + mlir::Type coorTy = + fir::ReferenceType::get(typeDescRecTy.getType(bindingsCompName)); + mlir::Value bindingBoxAddr = + rewriter.create(loc, coorTy, boxDesc, field); + mlir::Value bindingBox = rewriter.create(loc, bindingBoxAddr); + + // Load the correct binding. + mlir::Value bindings = rewriter.create(loc, bindingBox); + fir::RecordType bindingTy = + fir::unwrapIfDerived(bindingBox.getType().cast()); + mlir::Type bindingAddrTy = fir::ReferenceType::get(bindingTy); + mlir::Value bindingIdxVal = rewriter.create( + loc, rewriter.getIndexType(), rewriter.getIndexAttr(bindingIdx)); + mlir::Value bindingAddr = rewriter.create( + loc, bindingAddrTy, bindings, bindingIdxVal); + + // Get the function pointer. 
+ auto procCompName = Fortran::semantics::procCompName; + mlir::Value procField = rewriter.create( + loc, fieldTy, procCompName, bindingTy, mlir::ValueRange{}); + fir::RecordType procTy = + bindingTy.getType(procCompName).cast(); + mlir::Type procRefTy = fir::ReferenceType::get(procTy); + mlir::Value procRef = rewriter.create( + loc, procRefTy, bindingAddr, procField); + + auto addressFieldName = Fortran::lower::builtin::cptrFieldName; + mlir::Value addressField = rewriter.create( + loc, fieldTy, addressFieldName, procTy, mlir::ValueRange{}); + mlir::Type addressTy = procTy.getType(addressFieldName); + mlir::Type addressRefTy = fir::ReferenceType::get(addressTy); + mlir::Value addressRef = rewriter.create( + loc, addressRefTy, procRef, addressField); + mlir::Value address = rewriter.create(loc, addressRef); + + // Get the function type. + llvm::SmallVector argTypes; + for (mlir::Value operand : dispatch.getArgs()) + argTypes.push_back(operand.getType()); + llvm::SmallVector resTypes; + if (!dispatch.getResults().empty()) + resTypes.push_back(dispatch.getResults()[0].getType()); + + mlir::Type funTy = + mlir::FunctionType::get(rewriter.getContext(), argTypes, resTypes); + mlir::Value funcPtr = rewriter.create(loc, funTy, address); + + // Make the call. + llvm::SmallVector args{funcPtr}; + args.append(dispatch.getArgs().begin(), dispatch.getArgs().end()); + rewriter.replaceOpWithNewOp(dispatch, resTypes, nullptr, args); + return mlir::success(); + } + +private: + BindingTables bindingTables; +}; + /// Convert FIR structured control flow ops to CFG ops. class PolymorphicOpConversion : public fir::impl::PolymorphicOpConversionBase { @@ -83,14 +231,21 @@ class PolymorphicOpConversion void runOnOperation() override { auto *context = &getContext(); + auto mod = getOperation()->getParentOfType(); mlir::RewritePatternSet patterns(context); + + BindingTables bindingTables; + buildBindingTables(bindingTables, mod); + patterns.insert(context, moduleMutex); + patterns.insert(context, bindingTables); mlir::ConversionTarget target(*context); target.addLegalDialect(); // apply the patterns target.addIllegalOp(); + target.addIllegalOp(); target.markUnknownOpDynamicallyLegal([](Operation *) { return true; }); if (mlir::failed(mlir::applyPartialConversion(getOperation(), target, std::move(patterns)))) { diff --git a/flang/lib/Semantics/runtime-type-info.cpp b/flang/lib/Semantics/runtime-type-info.cpp index 29f63524b5c07..5e57c70c42fbb 100644 --- a/flang/lib/Semantics/runtime-type-info.cpp +++ b/flang/lib/Semantics/runtime-type-info.cpp @@ -151,7 +151,8 @@ RuntimeTableBuilder::RuntimeTableBuilder( : context_{c}, tables_{t}, derivedTypeSchema_{GetSchema("derivedtype")}, componentSchema_{GetSchema("component")}, procPtrSchema_{GetSchema( "procptrcomponent")}, - valueSchema_{GetSchema("value")}, bindingSchema_{GetSchema("binding")}, + valueSchema_{GetSchema("value")}, bindingSchema_{GetSchema( + bindingDescCompName)}, specialSchema_{GetSchema("specialbinding")}, deferredEnum_{GetEnumValue( "deferred")}, explicitEnum_{GetEnumValue("explicit")}, lenParameterEnum_{GetEnumValue( @@ -562,7 +563,7 @@ const Symbol *RuntimeTableBuilder::DescribeType(Scope &dtScope) { if (!isAbstractType) { std::vector bindings{ DescribeBindings(dtScope, scope)}; - AddValue(dtValues, derivedTypeSchema_, "binding"s, + AddValue(dtValues, derivedTypeSchema_, bindingDescCompName, SaveDerivedPointerTarget(scope, SaveObjectName(".v."s + distinctName), std::move(bindings), evaluate::ConstantSubscripts{ @@ -982,7 +983,7 @@ 
RuntimeTableBuilder::DescribeBindings(const Scope &dtScope, Scope &scope) { std::vector result; for (const SymbolRef &ref : CollectBindings(dtScope)) { evaluate::StructureConstructorValues values; - AddValue(values, bindingSchema_, "proc"s, + AddValue(values, bindingSchema_, procCompName, SomeExpr{evaluate::ProcedureDesignator{ ref.get().get().symbol()}}); AddValue(values, bindingSchema_, "name"s, @@ -1152,7 +1153,7 @@ void RuntimeTableBuilder::DescribeSpecialProc( values, specialSchema_, "which"s, SomeExpr{std::move(which.value())}); AddValue(values, specialSchema_, "isargdescriptorset"s, IntExpr<1>(isArgDescriptorSet)); - AddValue(values, specialSchema_, "proc"s, + AddValue(values, specialSchema_, procCompName, SomeExpr{evaluate::ProcedureDesignator{specific}}); // index might already be present in the case of an override specials.emplace(*index, diff --git a/flang/test/Fir/dispatch.f90 b/flang/test/Fir/dispatch.f90 index dcb52bed7d967..933c769d3e169 100644 --- a/flang/test/Fir/dispatch.f90 +++ b/flang/test/Fir/dispatch.f90 @@ -1,4 +1,4 @@ -! RUN: bbc -polymorphic-type -emit-fir %s -o - | tco | FileCheck %s +! RUN: bbc -polymorphic-type -emit-fir %s -o - | fir-opt --fir-polymorphic-op | FileCheck %s ! RUN: bbc -polymorphic-type -emit-fir %s -o - | FileCheck %s --check-prefix=BT ! Tests codegen of fir.dispatch operation. This test is intentionally run from @@ -182,105 +182,123 @@ program test_type_to_class end -! CHECK-LABEL: define void @_QMdispatch1Pdisplay_class( -! CHECK-SAME: ptr %[[CLASS:.*]]) +! CHECK-LABEL: func.func @_QMdispatch1Pdisplay_class( +! CHECK-SAME: %[[ARG:.*]]: [[CLASS:!fir.class<.*>>]] -! CHECK-DAG: %[[INT32:.*]] = alloca i32, i64 1 -! CHECK-DAG: %[[REAL:.*]] = alloca float, i64 1 -! CHECK-DAG: %[[I:.*]] = alloca i32, i64 1 +! CHECK-DAG: %[[INT32:.*]] = fir.alloca i32 +! CHECK-DAG: %[[REAL:.*]] = fir.alloca f32 +! CHECK-DAG: %[[I:.*]] = fir.alloca i32 ! Check dynamic dispatch equal to `call p%display2()` with binding index = 2. -! CHECK: %[[LOADED_CLASS:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %[[CLASS]] -! CHECK: %[[TYPEDESCPTR:.*]] = extractvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[LOADED_CLASS]], 7 -! CHECK: %[[LOADED_TYPEDESC:.*]] = load %_QM__fortran_type_infoTderivedtype, ptr %[[TYPEDESCPTR]] -! CHECK: %[[DT:.*]] = extractvalue %_QM__fortran_type_infoTderivedtype %[[LOADED_TYPEDESC]], 0 -! CHECK: %[[BINDING_BASE_ADDR:.*]] = extractvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] } %[[DT]], 0 -! CHECK: %[[BINDING_PTR:.*]] = getelementptr %_QM__fortran_type_infoTbinding, ptr %[[BINDING_BASE_ADDR]], i32 2 -! CHECK: %[[LOADED_BINDING:.*]] = load %_QM__fortran_type_infoTbinding, ptr %[[BINDING_PTR]] -! CHECK: %[[BUILTIN_FUNC_PTR:.*]] = extractvalue %_QM__fortran_type_infoTbinding %[[LOADED_BINDING]], 0 -! CHECK: %[[FUNC_ADDR:.*]] = extractvalue %_QM__fortran_builtinsT__builtin_c_funptr %[[BUILTIN_FUNC_PTR]], 0 -! CHECK: %[[FUNC_PTR:.*]] = inttoptr i64 %[[FUNC_ADDR]] to ptr -! CHECK: call void %[[FUNC_PTR]](ptr %[[CLASS]]) +! CHECK: %[[BOXDESC:.*]] = fir.box_tdesc %[[ARG]] : ([[CLASS]]) -> !fir.tdesc +! CHECK: %[[TYPEDESCPTR:.*]] = fir.convert %[[BOXDESC]] : (!fir.tdesc) -> !fir.ref<[[TYPEINFO:!fir.type<_QM__fortran_type_infoTderivedtype{.*}>]]> +! CHECK: %[[BINDING_FIELD:.*]] = fir.field_index binding, [[TYPEINFO]] +! CHECK: %[[BINDING_BOX_ADDR:.*]] = fir.coordinate_of %[[TYPEDESCPTR]], %[[BINDING_FIELD]] : (!fir.ref<[[TYPEINFO]]>, !fir.field) -> !fir.ref<[[BINDING_BOX_TYPE:.*]]> +! 
CHECK: %[[BINDING_BOX:.*]] = fir.load %[[BINDING_BOX_ADDR]] : !fir.ref<[[BINDING_BOX_TYPE]]> +! CHECK: %[[BINDING_BASE_ADDR:.*]] = fir.box_addr %[[BINDING_BOX]] : ([[BINDING_BOX_TYPE]]) -> !fir.ptr<[[BINDINGSINFO:.*]]> +! CHECK: %[[BINDING_PTR:.*]] = fir.coordinate_of %[[BINDING_BASE_ADDR]], %c2 : (!fir.ptr<[[BINDINGSINFO]]>, index) -> !fir.ref<[[BINDINGINFO:.*]]> +! CHECK: %[[PROC_FIELD:.*]] = fir.field_index proc, [[BINDINGINFO]] +! CHECK: %[[BUILTIN_FUNC_PTR:.*]] = fir.coordinate_of %[[BINDING_PTR]], %[[PROC_FIELD]] : ({{.*}}) -> !fir.ref<[[BUILTIN_FUNC_TYPE:.*]]> +! CHECK: %[[ADDRESS_FIELD:.*]] = fir.field_index __address, [[BUILTIN_FUNC_TYPE]] +! CHECK: %[[FUNC_ADDR_PTR:.*]] = fir.coordinate_of %[[BUILTIN_FUNC_PTR]], %[[ADDRESS_FIELD]] +! CHECK: %[[FUNC_ADDR:.*]] = fir.load %[[FUNC_ADDR_PTR]] : !fir.ref +! CHECK: %[[FUNC_PTR:.*]] = fir.convert %[[FUNC_ADDR]] : (i64) -> (([[CLASS]]) -> ()) +! CHECK: fir.call %[[FUNC_PTR]](%[[ARG]]) : ([[CLASS]]) -> () ! Check dynamic dispatch equal to `call p%display1()` with binding index = 1. -! CHECK: %[[LOADED_CLASS:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %[[CLASS]] -! CHECK: %[[TYPEDESCPTR:.*]] = extractvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[LOADED_CLASS]], 7 -! CHECK: %[[LOADED_TYPEDESC:.*]] = load %_QM__fortran_type_infoTderivedtype, ptr %[[TYPEDESCPTR]] -! CHECK: %[[DT:.*]] = extractvalue %_QM__fortran_type_infoTderivedtype %[[LOADED_TYPEDESC]], 0 -! CHECK: %[[BINDING_BASE_ADDR:.*]] = extractvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] } %[[DT]], 0 -! CHECK: %[[BINDING_PTR:.*]] = getelementptr %_QM__fortran_type_infoTbinding, ptr %[[BINDING_BASE_ADDR]], i32 1 -! CHECK: %[[LOADED_BINDING:.*]] = load %_QM__fortran_type_infoTbinding, ptr %[[BINDING_PTR]] -! CHECK: %[[BUILTIN_FUNC_PTR:.*]] = extractvalue %_QM__fortran_type_infoTbinding %[[LOADED_BINDING]], 0 -! CHECK: %[[FUNC_ADDR:.*]] = extractvalue %_QM__fortran_builtinsT__builtin_c_funptr %[[BUILTIN_FUNC_PTR]], 0 -! CHECK: %[[FUNC_PTR:.*]] = inttoptr i64 %[[FUNC_ADDR]] to ptr -! CHECK: call void %[[FUNC_PTR]](ptr %[[CLASS]]) +! CHECK: %[[BOXDESC:.*]] = fir.box_tdesc %[[ARG]] : ([[CLASS]]) -> !fir.tdesc +! CHECK: %[[TYPEDESCPTR:.*]] = fir.convert %[[BOXDESC]] : (!fir.tdesc) -> !fir.ref<[[TYPEINFO:!fir.type<_QM__fortran_type_infoTderivedtype{.*}>]]> +! CHECK: %[[BINDING_FIELD:.*]] = fir.field_index binding, [[TYPEINFO]] +! CHECK: %[[BINDING_BOX_ADDR:.*]] = fir.coordinate_of %[[TYPEDESCPTR]], %[[BINDING_FIELD]] : (!fir.ref<[[TYPEINFO]]>, !fir.field) -> !fir.ref<[[BINDING_BOX_TYPE:.*]]> +! CHECK: %[[BINDING_BOX:.*]] = fir.load %[[BINDING_BOX_ADDR]] : !fir.ref<[[BINDING_BOX_TYPE]]> +! CHECK: %[[BINDING_BASE_ADDR:.*]] = fir.box_addr %[[BINDING_BOX]] : ([[BINDING_BOX_TYPE]]) -> !fir.ptr<[[BINDINGSINFO:.*]]> +! CHECK: %[[BINDING_PTR:.*]] = fir.coordinate_of %[[BINDING_BASE_ADDR]], %c1 : (!fir.ptr<[[BINDINGSINFO]]>, index) -> !fir.ref<[[BINDINGINFO:.*]]> +! CHECK: %[[PROC_FIELD:.*]] = fir.field_index proc, [[BINDINGINFO]] +! CHECK: %[[BUILTIN_FUNC_PTR:.*]] = fir.coordinate_of %[[BINDING_PTR]], %[[PROC_FIELD]] : ({{.*}}) -> !fir.ref<[[BUILTIN_FUNC_TYPE:.*]]> +! CHECK: %[[ADDRESS_FIELD:.*]] = fir.field_index __address, [[BUILTIN_FUNC_TYPE]] +! CHECK: %[[FUNC_ADDR_PTR:.*]] = fir.coordinate_of %[[BUILTIN_FUNC_PTR]], %[[ADDRESS_FIELD]] +! CHECK: %[[FUNC_ADDR:.*]] = fir.load %[[FUNC_ADDR_PTR]] : !fir.ref +! CHECK: %[[FUNC_PTR:.*]] = fir.convert %[[FUNC_ADDR]] : (i64) -> (([[CLASS]]) -> ()) +! 
CHECK: fir.call %[[FUNC_PTR]](%[[ARG]]) : ([[CLASS]]) -> () ! Check dynamic dispatch equal to `call p%aproc()` with binding index = 0. -! CHECK: %[[LOADED_CLASS:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %[[CLASS]] -! CHECK: %[[TYPEDESCPTR:.*]] = extractvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[LOADED_CLASS]], 7 -! CHECK: %[[LOADED_TYPEDESC:.*]] = load %_QM__fortran_type_infoTderivedtype, ptr %[[TYPEDESCPTR]] -! CHECK: %[[DT:.*]] = extractvalue %_QM__fortran_type_infoTderivedtype %[[LOADED_TYPEDESC]], 0 -! CHECK: %[[BINDING_BASE_ADDR:.*]] = extractvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] } %[[DT]], 0 -! CHECK: %[[BINDING_PTR:.*]] = getelementptr %_QM__fortran_type_infoTbinding, ptr %[[BINDING_BASE_ADDR]], i32 0 -! CHECK: %[[LOADED_BINDING:.*]] = load %_QM__fortran_type_infoTbinding, ptr %[[BINDING_PTR]] -! CHECK: %[[BUILTIN_FUNC_PTR:.*]] = extractvalue %_QM__fortran_type_infoTbinding %[[LOADED_BINDING]], 0 -! CHECK: %[[FUNC_ADDR:.*]] = extractvalue %_QM__fortran_builtinsT__builtin_c_funptr %[[BUILTIN_FUNC_PTR]], 0 -! CHECK: %[[FUNC_PTR:.*]] = inttoptr i64 %[[FUNC_ADDR]] to ptr -! CHECK: call void %[[FUNC_PTR]](ptr %[[CLASS]]) +! CHECK: %[[BOXDESC:.*]] = fir.box_tdesc %[[ARG]] : ([[CLASS]]) -> !fir.tdesc +! CHECK: %[[TYPEDESCPTR:.*]] = fir.convert %[[BOXDESC]] : (!fir.tdesc) -> !fir.ref<[[TYPEINFO:!fir.type<_QM__fortran_type_infoTderivedtype{.*}>]]> +! CHECK: %[[BINDING_FIELD:.*]] = fir.field_index binding, [[TYPEINFO]] +! CHECK: %[[BINDING_BOX_ADDR:.*]] = fir.coordinate_of %[[TYPEDESCPTR]], %[[BINDING_FIELD]] : (!fir.ref<[[TYPEINFO]]>, !fir.field) -> !fir.ref<[[BINDING_BOX_TYPE:.*]]> +! CHECK: %[[BINDING_BOX:.*]] = fir.load %[[BINDING_BOX_ADDR]] : !fir.ref<[[BINDING_BOX_TYPE]]> +! CHECK: %[[BINDING_BASE_ADDR:.*]] = fir.box_addr %[[BINDING_BOX]] : ([[BINDING_BOX_TYPE]]) -> !fir.ptr<[[BINDINGSINFO:.*]]> +! CHECK: %[[BINDING_PTR:.*]] = fir.coordinate_of %[[BINDING_BASE_ADDR]], %c0 : (!fir.ptr<[[BINDINGSINFO]]>, index) -> !fir.ref<[[BINDINGINFO:.*]]> +! CHECK: %[[PROC_FIELD:.*]] = fir.field_index proc, [[BINDINGINFO]] +! CHECK: %[[BUILTIN_FUNC_PTR:.*]] = fir.coordinate_of %[[BINDING_PTR]], %[[PROC_FIELD]] : ({{.*}}) -> !fir.ref<[[BUILTIN_FUNC_TYPE:.*]]> +! CHECK: %[[ADDRESS_FIELD:.*]] = fir.field_index __address, [[BUILTIN_FUNC_TYPE]] +! CHECK: %[[FUNC_ADDR_PTR:.*]] = fir.coordinate_of %[[BUILTIN_FUNC_PTR]], %[[ADDRESS_FIELD]] +! CHECK: %[[FUNC_ADDR:.*]] = fir.load %[[FUNC_ADDR_PTR]] : !fir.ref +! CHECK: %[[FUNC_PTR:.*]] = fir.convert %[[FUNC_ADDR]] : (i64) -> (([[CLASS]]) -> ()) +! CHECK: fir.call %[[FUNC_PTR]](%[[ARG]]) : ([[CLASS]]) -> () ! Check dynamic dispatch of a function with result. -! CHECK: %[[LOADED_CLASS:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %[[CLASS]] -! CHECK: %[[TYPEDESCPTR:.*]] = extractvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[LOADED_CLASS]], 7 -! CHECK: %[[LOADED_TYPEDESC:.*]] = load %_QM__fortran_type_infoTderivedtype, ptr %[[TYPEDESCPTR]] -! CHECK: %[[DT:.*]] = extractvalue %_QM__fortran_type_infoTderivedtype %[[LOADED_TYPEDESC]], 0 -! CHECK: %[[BINDING_BASE_ADDR:.*]] = extractvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] } %[[DT]], 0 -! CHECK: %[[BINDING_PTR:.*]] = getelementptr %_QM__fortran_type_infoTbinding, ptr %[[BINDING_BASE_ADDR]], i32 3 -! CHECK: %[[LOADED_BINDING:.*]] = load %_QM__fortran_type_infoTbinding, ptr %[[BINDING_PTR]] -! 
CHECK: %[[BUILTIN_FUNC_PTR:.*]] = extractvalue %_QM__fortran_type_infoTbinding %[[LOADED_BINDING]], 0 -! CHECK: %[[FUNC_ADDR:.*]] = extractvalue %_QM__fortran_builtinsT__builtin_c_funptr %[[BUILTIN_FUNC_PTR]], 0 -! CHECK: %[[FUNC_PTR:.*]] = inttoptr i64 %[[FUNC_ADDR]] to ptr -! CHECK: %[[RET:.*]] = call i32 %[[FUNC_PTR]](ptr %[[CLASS]]) -! CHECK: store i32 %[[RET]], ptr %[[I]] +! CHECK: %[[BOXDESC:.*]] = fir.box_tdesc %[[ARG]] : ([[CLASS]]) -> !fir.tdesc +! CHECK: %[[TYPEDESCPTR:.*]] = fir.convert %[[BOXDESC]] : (!fir.tdesc) -> !fir.ref<[[TYPEINFO:!fir.type<_QM__fortran_type_infoTderivedtype{.*}>]]> +! CHECK: %[[BINDING_FIELD:.*]] = fir.field_index binding, [[TYPEINFO]] +! CHECK: %[[BINDING_BOX_ADDR:.*]] = fir.coordinate_of %[[TYPEDESCPTR]], %[[BINDING_FIELD]] : (!fir.ref<[[TYPEINFO]]>, !fir.field) -> !fir.ref<[[BINDING_BOX_TYPE:.*]]> +! CHECK: %[[BINDING_BOX:.*]] = fir.load %[[BINDING_BOX_ADDR]] : !fir.ref<[[BINDING_BOX_TYPE]]> +! CHECK: %[[BINDING_BASE_ADDR:.*]] = fir.box_addr %[[BINDING_BOX]] : ([[BINDING_BOX_TYPE]]) -> !fir.ptr<[[BINDINGSINFO:.*]]> +! CHECK: %[[BINDING_PTR:.*]] = fir.coordinate_of %[[BINDING_BASE_ADDR]], %c3 : (!fir.ptr<[[BINDINGSINFO]]>, index) -> !fir.ref<[[BINDINGINFO:.*]]> +! CHECK: %[[PROC_FIELD:.*]] = fir.field_index proc, [[BINDINGINFO]] +! CHECK: %[[BUILTIN_FUNC_PTR:.*]] = fir.coordinate_of %[[BINDING_PTR]], %[[PROC_FIELD]] : ({{.*}}) -> !fir.ref<[[BUILTIN_FUNC_TYPE:.*]]> +! CHECK: %[[ADDRESS_FIELD:.*]] = fir.field_index __address, [[BUILTIN_FUNC_TYPE]] +! CHECK: %[[FUNC_ADDR_PTR:.*]] = fir.coordinate_of %[[BUILTIN_FUNC_PTR]], %[[ADDRESS_FIELD]] +! CHECK: %[[FUNC_ADDR:.*]] = fir.load %[[FUNC_ADDR_PTR]] : !fir.ref +! CHECK: %[[FUNC_PTR:.*]] = fir.convert %[[FUNC_ADDR]] : (i64) -> (([[CLASS]]) -> i32) +! CHECK: %[[RES:.*]] = fir.call %[[FUNC_PTR]](%[[ARG]]) : ([[CLASS]]) -> i32 ! Check dynamic dispatch of call with passed-object and additional argument -! CHECK: store float 2.500000e+00, ptr %[[REAL]] -! CHECK: %[[LOADED_CLASS:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %[[CLASS]] -! CHECK: %[[TYPEDESCPTR:.*]] = extractvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[LOADED_CLASS]], 7 -! CHECK: %[[LOADED_TYPEDESC:.*]] = load %_QM__fortran_type_infoTderivedtype, ptr %[[TYPEDESCPTR]] -! CHECK: %[[DT:.*]] = extractvalue %_QM__fortran_type_infoTderivedtype %[[LOADED_TYPEDESC]], 0 -! CHECK: %[[BINDING_BASE_ADDR:.*]] = extractvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] } %[[DT]], 0 -! CHECK: %[[BINDING_PTR:.*]] = getelementptr %_QM__fortran_type_infoTbinding, ptr %[[BINDING_BASE_ADDR]], i32 6 -! CHECK: %[[LOADED_BINDING:.*]] = load %_QM__fortran_type_infoTbinding, ptr %[[BINDING_PTR]] -! CHECK: %[[BUILTIN_FUNC_PTR:.*]] = extractvalue %_QM__fortran_type_infoTbinding %[[LOADED_BINDING]], 0 -! CHECK: %[[FUNC_ADDR:.*]] = extractvalue %_QM__fortran_builtinsT__builtin_c_funptr %[[BUILTIN_FUNC_PTR]], 0 -! CHECK: %[[FUNC_PTR:.*]] = inttoptr i64 %[[FUNC_ADDR]] to ptr -! CHECK: call void %[[FUNC_PTR]](ptr %[[CLASS]], ptr %[[REAL]]) +! CHECK: %[[BOXDESC:.*]] = fir.box_tdesc %[[ARG]] : ([[CLASS]]) -> !fir.tdesc +! CHECK: %[[TYPEDESCPTR:.*]] = fir.convert %[[BOXDESC]] : (!fir.tdesc) -> !fir.ref<[[TYPEINFO:!fir.type<_QM__fortran_type_infoTderivedtype{.*}>]]> +! CHECK: %[[BINDING_FIELD:.*]] = fir.field_index binding, [[TYPEINFO]] +! CHECK: %[[BINDING_BOX_ADDR:.*]] = fir.coordinate_of %[[TYPEDESCPTR]], %[[BINDING_FIELD]] : (!fir.ref<[[TYPEINFO]]>, !fir.field) -> !fir.ref<[[BINDING_BOX_TYPE:.*]]> +! 
CHECK: %[[BINDING_BOX:.*]] = fir.load %[[BINDING_BOX_ADDR]] : !fir.ref<[[BINDING_BOX_TYPE]]> +! CHECK: %[[BINDING_BASE_ADDR:.*]] = fir.box_addr %[[BINDING_BOX]] : ([[BINDING_BOX_TYPE]]) -> !fir.ptr<[[BINDINGSINFO:.*]]> +! CHECK: %[[BINDING_PTR:.*]] = fir.coordinate_of %[[BINDING_BASE_ADDR]], %c6 : (!fir.ptr<[[BINDINGSINFO]]>, index) -> !fir.ref<[[BINDINGINFO:.*]]> +! CHECK: %[[PROC_FIELD:.*]] = fir.field_index proc, [[BINDINGINFO]] +! CHECK: %[[BUILTIN_FUNC_PTR:.*]] = fir.coordinate_of %[[BINDING_PTR]], %[[PROC_FIELD]] : ({{.*}}) -> !fir.ref<[[BUILTIN_FUNC_TYPE:.*]]> +! CHECK: %[[ADDRESS_FIELD:.*]] = fir.field_index __address, [[BUILTIN_FUNC_TYPE]] +! CHECK: %[[FUNC_ADDR_PTR:.*]] = fir.coordinate_of %[[BUILTIN_FUNC_PTR]], %[[ADDRESS_FIELD]] +! CHECK: %[[FUNC_ADDR:.*]] = fir.load %[[FUNC_ADDR_PTR]] : !fir.ref +! CHECK: %[[FUNC_PTR:.*]] = fir.convert %[[FUNC_ADDR]] : (i64) -> (([[CLASS]], !fir.ref) -> ()) +! CHECK: fir.call %[[FUNC_PTR]](%[[ARG]], %[[REAL]]) : ([[CLASS]], !fir.ref) -> () ! Check dynamic dispatch of a call with NOPASS -! CHECK: %[[LOADED_CLASS:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %[[CLASS]] -! CHECK: %[[TYPEDESCPTR:.*]] = extractvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[LOADED_CLASS]], 7 -! CHECK: %[[LOADED_TYPEDESC:.*]] = load %_QM__fortran_type_infoTderivedtype, ptr %[[TYPEDESCPTR]] -! CHECK: %[[DT:.*]] = extractvalue %_QM__fortran_type_infoTderivedtype %[[LOADED_TYPEDESC]], 0 -! CHECK: %[[BINDING_BASE_ADDR:.*]] = extractvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] } %[[DT]], 0 -! CHECK: %[[BINDING_PTR:.*]] = getelementptr %_QM__fortran_type_infoTbinding, ptr %[[BINDING_BASE_ADDR]], i32 4 -! CHECK: %[[LOADED_BINDING:.*]] = load %_QM__fortran_type_infoTbinding, ptr %[[BINDING_PTR]] -! CHECK: %[[BUILTIN_FUNC_PTR:.*]] = extractvalue %_QM__fortran_type_infoTbinding %[[LOADED_BINDING]], 0 -! CHECK: %[[FUNC_ADDR:.*]] = extractvalue %_QM__fortran_builtinsT__builtin_c_funptr %[[BUILTIN_FUNC_PTR]], 0 -! CHECK: %[[FUNC_PTR:.*]] = inttoptr i64 %[[FUNC_ADDR]] to ptr -! CHECK: call void %[[FUNC_PTR]]() - -! CHECK: store i32 1, ptr %[[INT32]] -! CHECK: %[[LOADED_CLASS:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %[[CLASS]] -! CHECK: %[[TYPEDESCPTR:.*]] = extractvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[LOADED_CLASS]], 7 -! CHECK: %[[LOADED_TYPEDESC:.*]] = load %_QM__fortran_type_infoTderivedtype, ptr %[[TYPEDESCPTR]] -! CHECK: %[[DT:.*]] = extractvalue %_QM__fortran_type_infoTderivedtype %[[LOADED_TYPEDESC]], 0 -! CHECK: %[[BINDING_BASE_ADDR:.*]] = extractvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] } %[[DT]], 0 -! CHECK: %[[BINDING_PTR:.*]] = getelementptr %_QM__fortran_type_infoTbinding, ptr %[[BINDING_BASE_ADDR]], i32 5 -! CHECK: %[[LOADED_BINDING:.*]] = load %_QM__fortran_type_infoTbinding, ptr %[[BINDING_PTR]] -! CHECK: %[[BUILTIN_FUNC_PTR:.*]] = extractvalue %_QM__fortran_type_infoTbinding %[[LOADED_BINDING]], 0 -! CHECK: %[[FUNC_ADDR:.*]] = extractvalue %_QM__fortran_builtinsT__builtin_c_funptr %[[BUILTIN_FUNC_PTR]], 0 -! CHECK: %[[FUNC_PTR:.*]] = inttoptr i64 %[[FUNC_ADDR]] to ptr -! CHECK: call void %[[FUNC_PTR]](ptr %[[INT32]], ptr %[[CLASS]]) +! CHECK: %[[BOXDESC:.*]] = fir.box_tdesc %[[ARG]] : ([[CLASS]]) -> !fir.tdesc +! CHECK: %[[TYPEDESCPTR:.*]] = fir.convert %[[BOXDESC]] : (!fir.tdesc) -> !fir.ref<[[TYPEINFO:!fir.type<_QM__fortran_type_infoTderivedtype{.*}>]]> +! CHECK: %[[BINDING_FIELD:.*]] = fir.field_index binding, [[TYPEINFO]] +! 
CHECK: %[[BINDING_BOX_ADDR:.*]] = fir.coordinate_of %[[TYPEDESCPTR]], %[[BINDING_FIELD]] : (!fir.ref<[[TYPEINFO]]>, !fir.field) -> !fir.ref<[[BINDING_BOX_TYPE:.*]]> +! CHECK: %[[BINDING_BOX:.*]] = fir.load %[[BINDING_BOX_ADDR]] : !fir.ref<[[BINDING_BOX_TYPE]]> +! CHECK: %[[BINDING_BASE_ADDR:.*]] = fir.box_addr %[[BINDING_BOX]] : ([[BINDING_BOX_TYPE]]) -> !fir.ptr<[[BINDINGSINFO:.*]]> +! CHECK: %[[BINDING_PTR:.*]] = fir.coordinate_of %[[BINDING_BASE_ADDR]], %c4 : (!fir.ptr<[[BINDINGSINFO]]>, index) -> !fir.ref<[[BINDINGINFO:.*]]> +! CHECK: %[[PROC_FIELD:.*]] = fir.field_index proc, [[BINDINGINFO]] +! CHECK: %[[BUILTIN_FUNC_PTR:.*]] = fir.coordinate_of %[[BINDING_PTR]], %[[PROC_FIELD]] : ({{.*}}) -> !fir.ref<[[BUILTIN_FUNC_TYPE:.*]]> +! CHECK: %[[ADDRESS_FIELD:.*]] = fir.field_index __address, [[BUILTIN_FUNC_TYPE]] +! CHECK: %[[FUNC_ADDR_PTR:.*]] = fir.coordinate_of %[[BUILTIN_FUNC_PTR]], %[[ADDRESS_FIELD]] +! CHECK: %[[FUNC_ADDR:.*]] = fir.load %[[FUNC_ADDR_PTR]] : !fir.ref +! CHECK: %[[FUNC_PTR:.*]] = fir.convert %[[FUNC_ADDR]] : (i64) -> (() -> ()) +! CHECK: fir.call %[[FUNC_PTR]]() : () -> () + +! CHECK: %[[BOXDESC:.*]] = fir.box_tdesc %[[ARG]] : ([[CLASS]]) -> !fir.tdesc +! CHECK: %[[TYPEDESCPTR:.*]] = fir.convert %[[BOXDESC]] : (!fir.tdesc) -> !fir.ref<[[TYPEINFO:!fir.type<_QM__fortran_type_infoTderivedtype{.*}>]]> +! CHECK: %[[BINDING_FIELD:.*]] = fir.field_index binding, [[TYPEINFO]] +! CHECK: %[[BINDING_BOX_ADDR:.*]] = fir.coordinate_of %[[TYPEDESCPTR]], %[[BINDING_FIELD]] : (!fir.ref<[[TYPEINFO]]>, !fir.field) -> !fir.ref<[[BINDING_BOX_TYPE:.*]]> +! CHECK: %[[BINDING_BOX:.*]] = fir.load %[[BINDING_BOX_ADDR]] : !fir.ref<[[BINDING_BOX_TYPE]]> +! CHECK: %[[BINDING_BASE_ADDR:.*]] = fir.box_addr %[[BINDING_BOX]] : ([[BINDING_BOX_TYPE]]) -> !fir.ptr<[[BINDINGSINFO:.*]]> +! CHECK: %[[BINDING_PTR:.*]] = fir.coordinate_of %[[BINDING_BASE_ADDR]], %c5 : (!fir.ptr<[[BINDINGSINFO]]>, index) -> !fir.ref<[[BINDINGINFO:.*]]> +! CHECK: %[[PROC_FIELD:.*]] = fir.field_index proc, [[BINDINGINFO]] +! CHECK: %[[BUILTIN_FUNC_PTR:.*]] = fir.coordinate_of %[[BINDING_PTR]], %[[PROC_FIELD]] : ({{.*}}) -> !fir.ref<[[BUILTIN_FUNC_TYPE:.*]]> +! CHECK: %[[ADDRESS_FIELD:.*]] = fir.field_index __address, [[BUILTIN_FUNC_TYPE]] +! CHECK: %[[FUNC_ADDR_PTR:.*]] = fir.coordinate_of %[[BUILTIN_FUNC_PTR]], %[[ADDRESS_FIELD]] +! CHECK: %[[FUNC_ADDR:.*]] = fir.load %[[FUNC_ADDR_PTR]] : !fir.ref +! CHECK: %[[FUNC_PTR:.*]] = fir.convert %[[FUNC_ADDR]] : (i64) -> ((!fir.ref, [[CLASS]]) -> ()) +! CHECK: fir.call %[[FUNC_PTR]](%[[INT32]], %[[ARG]]) : (!fir.ref, [[CLASS]]) -> () ! CHECK-LABEL: _QMdispatch1Pno_pass_array ! CHECK-LABEL: _QMdispatch1Pno_pass_array_allocatable diff --git a/flang/test/Lower/allocatable-polymorphic.f90 b/flang/test/Lower/allocatable-polymorphic.f90 index b129c7c8cdbe1..c3c01a39b8606 100644 --- a/flang/test/Lower/allocatable-polymorphic.f90 +++ b/flang/test/Lower/allocatable-polymorphic.f90 @@ -586,9 +586,9 @@ program test_alloc ! LLVM: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[C1_LOAD]], ptr %{{.*}} ! LLVM: %[[GEP_TDESC_C1:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 7 ! LLVM: %[[TDESC_C1:.*]] = load ptr, ptr %[[GEP_TDESC_C1]] -! LLVM: %[[ELEM_SIZE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.}}, i32 0, i32 1 +! LLVM: %[[ELEM_SIZE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 1 ! LLVM: %[[ELEM_SIZE:.*]] = load i64, ptr %[[ELEM_SIZE_GEP]] -! 
LLVM: %[[TYPE_CODE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.}}, i32 0, i32 4 +! LLVM: %[[TYPE_CODE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 4 ! LLVM: %[[TYPE_CODE:.*]] = load i32, ptr %[[TYPE_CODE_GEP]] ! LLVM: %{{.*}} = insertvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } undef, i64 %[[ELEM_SIZE]], 1 ! LLVM: %[[TRUNC_TYPE_CODE:.*]] = trunc i32 %[[TYPE_CODE]] to i8 @@ -600,9 +600,9 @@ program test_alloc ! LLVM: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[LOAD_C2]], ptr %{{.*}} ! LLVM: %[[GEP_TDESC_C2:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 7 ! LLVM: %[[TDESC_C2:.*]] = load ptr, ptr %[[GEP_TDESC_C2]] -! LLVM: %[[ELEM_SIZE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.}}, i32 0, i32 1 +! LLVM: %[[ELEM_SIZE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 1 ! LLVM: %[[ELEM_SIZE:.*]] = load i64, ptr %[[ELEM_SIZE_GEP]] -! LLVM: %[[TYPE_CODE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.}}, i32 0, i32 4 +! LLVM: %[[TYPE_CODE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 4 ! LLVM: %[[TYPE_CODE:.*]] = load i32, ptr %[[TYPE_CODE_GEP]] ! LLVM: %{{.*}} = insertvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } undef, i64 %[[ELEM_SIZE]], 1 ! LLVM: %[[TRUNC_TYPE_CODE:.*]] = trunc i32 %[[TYPE_CODE]] to i8 From 7739be7c6b6d017bf6b4445c5010e59314655995 Mon Sep 17 00:00:00 2001 From: Jeff Byrnes Date: Wed, 15 Mar 2023 12:11:20 -0700 Subject: [PATCH 149/208] [ArgPromotion] Remove dead code produced by removing dead arguments ArgPromotion currently produces phantom / dead loads. A good example of this is store-into-inself.ll. First, ArgPromo finds the promotable argument %p in @l. Then it inserts a load of %p in the caller, and passes instead the loaded value / transforms the function body. PromoteMem2Reg is able to optimize out the entire function body, resulting in an unused argument. In a subsequent ArgPromotion pass, it removes the dead argument, resulting in a dead load in the caller. These dead loads may reduce effectiveness of other transformations (e.g. SimplifyCFG, MergedLoadStoreMotion). This patch removes loads and geps that are made dead in the caller after removal of dead args. Differential Revision: https://reviews.llvm.org/D146327 --- llvm/lib/Transforms/IPO/ArgumentPromotion.cpp | 8 ++++++ .../propagate-remove-dead-args.ll | 11 ++------ .../ArgumentPromotion/store-into-inself.ll | 1 - .../dce-after-argument-promotion-loads.ll | 25 +++++++------------ 4 files changed, 19 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index dd1a3b78a378c..3b1a174f5cc63 100644 --- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -67,6 +67,7 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" #include #include @@ -220,6 +221,8 @@ doPromotion(Function *F, FunctionAnalysisManager &FAM, // pass in the loaded pointers. 
SmallVector Args; const DataLayout &DL = F->getParent()->getDataLayout(); + SmallVector DeadArgs; + while (!F->use_empty()) { CallBase &CB = cast(*F->user_back()); assert(CB.getCalledFunction() == F); @@ -255,6 +258,9 @@ doPromotion(Function *F, FunctionAnalysisManager &FAM, Args.push_back(LI); ArgAttrVec.push_back(AttributeSet()); } + } else { + assert(ArgsToPromote.count(&*I) && I->use_empty()); + DeadArgs.emplace_back(AI->get()); } } @@ -297,6 +303,8 @@ doPromotion(Function *F, FunctionAnalysisManager &FAM, CB.eraseFromParent(); } + RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadArgs); + // Since we have now created the new function, splice the body of the old // function right into the new function, leaving the old rotting hulk of the // function empty. diff --git a/llvm/test/Transforms/ArgumentPromotion/propagate-remove-dead-args.ll b/llvm/test/Transforms/ArgumentPromotion/propagate-remove-dead-args.ll index cc1f7fb26a479..4176a8a7bc5c8 100644 --- a/llvm/test/Transforms/ArgumentPromotion/propagate-remove-dead-args.ll +++ b/llvm/test/Transforms/ArgumentPromotion/propagate-remove-dead-args.ll @@ -18,18 +18,12 @@ entry: define internal void @parent(ptr %this, ptr %p1, ptr %p2) { ; CHECK-LABEL: define internal void @parent -; CHECK-SAME: (ptr [[THIS:%.*]], ptr [[P1:%.*]], ptr [[P2:%.*]]) { +; CHECK-SAME: (ptr [[P1:%.*]], ptr [[P2:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[SRC_ELEMENT_OP_0:%.*]] = getelementptr ptr, ptr [[THIS]], i64 0 -; CHECK-NEXT: [[LOAD0:%.*]] = load ptr, ptr [[SRC_ELEMENT_OP_0]], align 8 ; CHECK-NEXT: [[P2_VAL2:%.*]] = load half, ptr [[P2]], align 2 ; CHECK-NEXT: call void @child(ptr [[P1]], half [[P2_VAL2]]) -; CHECK-NEXT: [[SRC_ELEMENT_OP_1:%.*]] = getelementptr ptr, ptr [[THIS]], i64 1 -; CHECK-NEXT: [[LOAD1:%.*]] = load ptr, ptr [[SRC_ELEMENT_OP_1]], align 8 ; CHECK-NEXT: [[P2_VAL1:%.*]] = load half, ptr [[P2]], align 2 ; CHECK-NEXT: call void @child(ptr [[P1]], half [[P2_VAL1]]) -; CHECK-NEXT: [[SRC_ELEMENT_OP_2:%.*]] = getelementptr ptr, ptr [[THIS]], i64 2 -; CHECK-NEXT: [[LOAD2:%.*]] = load ptr, ptr [[SRC_ELEMENT_OP_2]], align 8 ; CHECK-NEXT: [[P2_VAL:%.*]] = load half, ptr [[P2]], align 2 ; CHECK-NEXT: call void @child(ptr [[P1]], half [[P2_VAL]]) ; CHECK-NEXT: ret void @@ -50,10 +44,9 @@ entry: define void @grandparent() { ; CHECK-LABEL: define void @grandparent() { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[F:%.*]] = alloca [[PTR_STRUCT:%.*]], align 8 ; CHECK-NEXT: [[XPTR:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[YPTR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: call void @parent(ptr [[F]], ptr [[XPTR]], ptr [[YPTR]]) +; CHECK-NEXT: call void @parent(ptr [[XPTR]], ptr [[YPTR]]) ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/ArgumentPromotion/store-into-inself.ll b/llvm/test/Transforms/ArgumentPromotion/store-into-inself.ll index 7d7099003dc77..be94af6a0bd03 100644 --- a/llvm/test/Transforms/ArgumentPromotion/store-into-inself.ll +++ b/llvm/test/Transforms/ArgumentPromotion/store-into-inself.ll @@ -83,7 +83,6 @@ define i32 @main() nounwind { ; CHECK-NEXT: call void @g(ptr byval(ptr) align 4 [[S]]) #[[ATTR0]] ; CHECK-NEXT: call void @h(ptr byval(ptr) align 4 [[S]]) #[[ATTR0]] ; CHECK-NEXT: call void @k(ptr byval(ptr) align 4 [[S]]) #[[ATTR0]] -; CHECK-NEXT: [[S_VAL:%.*]] = load ptr, ptr [[S]], align 8 ; CHECK-NEXT: call void @l() #[[ATTR0]] ; CHECK-NEXT: ret i32 0 ; diff --git a/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion-loads.ll b/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion-loads.ll index 
2bdd42b3dd8ca..2fe8f39e423a5 100644 --- a/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion-loads.ll +++ b/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion-loads.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt -O3 -S < %s | FileCheck %s ; Arg promotion eliminates the struct argument, and eliminates dead arguments, but introduces and leaves dead loads of the eliminated dead arg in callers @@ -13,24 +13,17 @@ entry: } define ptr @parent(ptr align 8 dereferenceable(72) %f, i16 %val1, i16 %val2, i32 %val3) align 2 { -; CHECK-LABEL: define {{[^@]+}}@parent +; CHECK-LABEL: define nonnull ptr @parent ; CHECK-SAME: (ptr readonly returned align 8 dereferenceable(72) [[F:%.*]], i16 [[VAL1:%.*]], i16 [[VAL2:%.*]], i32 [[VAL3:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] align 2 { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[F]], i64 64 +; CHECK-NEXT: [[F_VAL:%.*]] = load ptr, ptr [[TMP0]], align 8 ; CHECK-NEXT: [[CMP_NOT_NOT_I:%.*]] = icmp eq i32 [[VAL3]], 0 -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_SS:%.*]], ptr [[F]], i64 0, i32 8 -; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -; CHECK-NEXT: br i1 [[CMP_NOT_NOT_I]], label [[IF_THEN_I:%.*]], label [[IF_ELSE_I:%.*]] -; CHECK: if.then.i: -; CHECK-NEXT: store i16 [[VAL1]], ptr [[TMP1]], align 2 -; CHECK-NEXT: [[ADD_PTR_I_I_I_I_I:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 -; CHECK-NEXT: br label [[BADCHILD_EXIT:%.*]] -; CHECK: if.else.i: -; CHECK-NEXT: [[ADD_PTR_I_I_I_I7_I:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 -; CHECK-NEXT: store i16 [[VAL1]], ptr [[ADD_PTR_I_I_I_I7_I]], align 2 -; CHECK-NEXT: br label [[BADCHILD_EXIT]] -; CHECK: badChild.exit: -; CHECK-NEXT: [[DOTSINK_I:%.*]] = phi ptr [ [[TMP1]], [[IF_ELSE_I]] ], [ [[ADD_PTR_I_I_I_I_I]], [[IF_THEN_I]] ] -; CHECK-NEXT: store i16 [[VAL2]], ptr [[DOTSINK_I]], align 2 +; CHECK-NEXT: [[SPEC_SELECT_I:%.*]] = select i1 [[CMP_NOT_NOT_I]], i16 [[VAL1]], i16 [[VAL2]] +; CHECK-NEXT: [[SPEC_SELECT2_I:%.*]] = select i1 [[CMP_NOT_NOT_I]], i16 [[VAL2]], i16 [[VAL1]] +; CHECK-NEXT: store i16 [[SPEC_SELECT_I]], ptr [[F_VAL]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[F_VAL]], i64 16 +; CHECK-NEXT: store i16 [[SPEC_SELECT2_I]], ptr [[TMP1]], align 2 ; CHECK-NEXT: ret ptr [[F]] ; entry: From de939c6cd80c1e88913f1ef12be11aea501eb89b Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Mon, 13 Mar 2023 16:43:05 -0700 Subject: [PATCH 150/208] [libc] enable printf using system FILE The printf and fprintf implementations use our internal implementation to improve performance when it's available, but this patch enables using the public FILE API for overlay mode. 
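The selection is a compile-time switch on LIBC_COPT_PRINTF_USE_SYSTEM_FILE: when the flag is set (overlay mode) the entrypoints and vfprintf_internal operate on the system's ::FILE, otherwise they keep using the internal __llvm_libc::File. A condensed sketch of the idea, paraphrasing the fprintf.cpp change below rather than quoting it verbatim:

  #ifndef LIBC_COPT_PRINTF_USE_SYSTEM_FILE
  using FileT = __llvm_libc::File; // full build: internal file implementation
  #else
  using FileT = ::FILE;            // overlay mode: the system's FILE
  #endif

  // vfprintf_internal is now templated on the file type, so the same
  // printf core drives either file implementation.
  int ret_val = printf_core::vfprintf_internal(
      reinterpret_cast<FileT *>(stream), format, args);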
Reviewed By: sivachandra, lntue Differential Revision: https://reviews.llvm.org/D146001 --- libc/config/linux/x86_64/entrypoints.txt | 4 +- libc/src/stdio/CMakeLists.txt | 35 +++++--- libc/src/stdio/fprintf.cpp | 10 ++- libc/src/stdio/printf.cpp | 12 ++- libc/src/stdio/printf_core/CMakeLists.txt | 16 ++-- libc/src/stdio/printf_core/file_writer.cpp | 54 ------------- libc/src/stdio/printf_core/file_writer.h | 79 ++++++++++++++++--- .../stdio/printf_core/vfprintf_internal.cpp | 32 -------- .../src/stdio/printf_core/vfprintf_internal.h | 18 ++++- libc/test/src/stdio/CMakeLists.txt | 17 +++- libc/test/src/stdio/fprintf_test.cpp | 32 ++++++-- 11 files changed, 174 insertions(+), 135 deletions(-) delete mode 100644 libc/src/stdio/printf_core/file_writer.cpp delete mode 100644 libc/src/stdio/printf_core/vfprintf_internal.cpp diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index b3017338f8260..5c0b3103f5615 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -111,6 +111,8 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.stdio.remove libc.src.stdio.sprintf libc.src.stdio.snprintf + libc.src.stdio.fprintf + libc.src.stdio.printf # sys/mman.h entrypoints libc.src.sys.mman.madvise @@ -412,10 +414,8 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.stdio.funlockfile libc.src.stdio.fwrite libc.src.stdio.fwrite_unlocked - libc.src.stdio.fprintf libc.src.stdio.getc libc.src.stdio.getc_unlocked - libc.src.stdio.printf libc.src.stdio.sscanf libc.src.stdio.scanf libc.src.stdio.fscanf diff --git a/libc/src/stdio/CMakeLists.txt b/libc/src/stdio/CMakeLists.txt index 5f8d17953f633..7ccbf9aa28c4c 100644 --- a/libc/src/stdio/CMakeLists.txt +++ b/libc/src/stdio/CMakeLists.txt @@ -480,29 +480,42 @@ add_entrypoint_object( libc.src.stdio.printf_core.writer ) +list(APPEND printf_deps + libc.src.__support.arg_list + libc.src.stdio.printf_core.vfprintf_internal +) +if(LLVM_LIBC_FULL_BUILD) + list(APPEND printf_deps + libc.src.__support.File.file + libc.src.__support.File.platform_file + ) +else() + set(printf_copts "-DLIBC_COPT_PRINTF_USE_SYSTEM_FILE") +endif() + add_entrypoint_object( - fprintf + printf SRCS - fprintf.cpp + printf.cpp HDRS - fprintf.h + printf.h DEPENDS - libc.src.__support.arg_list - libc.src.stdio.printf_core.vfprintf_internal + ${printf_deps} + COMPILE_OPTIONS + ${printf_copts} ) - add_entrypoint_object( - printf + fprintf SRCS - printf.cpp + fprintf.cpp HDRS - printf.h + fprintf.h DEPENDS - libc.src.__support.File.file - libc.src.__support.File.platform_file libc.src.__support.arg_list libc.src.stdio.printf_core.vfprintf_internal + COMPILE_OPTIONS + ${printf_copts} ) add_entrypoint_object( diff --git a/libc/src/stdio/fprintf.cpp b/libc/src/stdio/fprintf.cpp index 796d5b5c47095..da8fabf5ab542 100644 --- a/libc/src/stdio/fprintf.cpp +++ b/libc/src/stdio/fprintf.cpp @@ -13,9 +13,16 @@ #include "src/stdio/printf_core/vfprintf_internal.h" #include +#include namespace __llvm_libc { +#ifndef LIBC_COPT_PRINTF_USE_SYSTEM_FILE +using FileT = __llvm_libc::File; +#else // defined(LIBC_COPT_PRINTF_USE_SYSTEM_FILE) +using FileT = ::FILE; +#endif // LIBC_COPT_PRINTF_USE_SYSTEM_FILE + LLVM_LIBC_FUNCTION(int, fprintf, (::FILE *__restrict stream, const char *__restrict format, ...)) { @@ -25,7 +32,8 @@ LLVM_LIBC_FUNCTION(int, fprintf, // and pointer semantics, as well as handling // destruction automatically. 
va_end(vlist); - int ret_val = printf_core::vfprintf_internal(stream, format, args); + int ret_val = printf_core::vfprintf_internal( + reinterpret_cast(stream), format, args); return ret_val; } diff --git a/libc/src/stdio/printf.cpp b/libc/src/stdio/printf.cpp index 8fd8b9cc57fad..ca6f61ed63033 100644 --- a/libc/src/stdio/printf.cpp +++ b/libc/src/stdio/printf.cpp @@ -8,11 +8,18 @@ #include "src/stdio/printf.h" -#include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include +#include + +#ifndef LIBC_COPT_PRINTF_USE_SYSTEM_FILE +#include "src/__support/File/file.h" +#define PRINTF_STDOUT __llvm_libc::stdout +#else // LIBC_COPT_PRINTF_USE_SYSTEM_FILE +#define PRINTF_STDOUT ::stdout +#endif // LIBC_COPT_PRINTF_USE_SYSTEM_FILE namespace __llvm_libc { @@ -23,8 +30,7 @@ LLVM_LIBC_FUNCTION(int, printf, (const char *__restrict format, ...)) { // and pointer semantics, as well as handling // destruction automatically. va_end(vlist); - int ret_val = printf_core::vfprintf_internal( - reinterpret_cast<::FILE *>(__llvm_libc::stdout), format, args); + int ret_val = printf_core::vfprintf_internal(PRINTF_STDOUT, format, args); return ret_val; } diff --git a/libc/src/stdio/printf_core/CMakeLists.txt b/libc/src/stdio/printf_core/CMakeLists.txt index 31db8ad3c524c..109399772b53d 100644 --- a/libc/src/stdio/printf_core/CMakeLists.txt +++ b/libc/src/stdio/printf_core/CMakeLists.txt @@ -116,35 +116,31 @@ add_object_library( libc.src.__support.arg_list ) -if(NOT (TARGET libc.src.__support.File.file)) - # Not all platforms have a file implementation. If file is unvailable, - # then we must skip all file based printf sections. +if(NOT (TARGET libc.src.__support.File.file) AND LLVM_LIBC_FULL_BUILD) + # Not all platforms have a file implementation. If file is unvailable, and a + # full build is requested, then we must skip all file based printf sections. return() endif() -add_object_library( +add_header_library( file_writer - SRCS - file_writer.cpp HDRS file_writer.h DEPENDS + libc.include.stdio libc.src.__support.File.file libc.src.__support.CPP.string_view libc.src.string.memory_utils.memset_implementation .core_structs ) -add_object_library( +add_header_library( vfprintf_internal - SRCS - vfprintf_internal.cpp HDRS vfprintf_internal.h DEPENDS libc.include.stdio libc.src.__support.File.file - libc.src.__support.File.platform_file libc.src.__support.arg_list libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.file_writer diff --git a/libc/src/stdio/printf_core/file_writer.cpp b/libc/src/stdio/printf_core/file_writer.cpp deleted file mode 100644 index 0e07e1c1eb8a7..0000000000000 --- a/libc/src/stdio/printf_core/file_writer.cpp +++ /dev/null @@ -1,54 +0,0 @@ -//===-- FILE Writer implementation for printf -------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/stdio/printf_core/file_writer.h" -#include "src/__support/CPP/string_view.h" -#include "src/__support/File/file.h" -#include "src/stdio/printf_core/core_structs.h" -#include - -namespace __llvm_libc { -namespace printf_core { - -int FileWriter::write(const char *__restrict to_write, size_t len) { - auto result = file->write_unlocked(to_write, len); - int written = result.value; - if (written != static_cast(len) || result.has_error()) - written = FILE_WRITE_ERROR; - if (file->error_unlocked()) - written = FILE_STATUS_ERROR; - return written; -} - -int FileWriter::write_str(void *raw_pointer, cpp::string_view new_string) { - FileWriter *file_writer = reinterpret_cast(raw_pointer); - return file_writer->write(new_string.data(), new_string.size()); -} - -int FileWriter::write_chars(void *raw_pointer, char new_char, size_t len) { - FileWriter *file_writer = reinterpret_cast(raw_pointer); - constexpr size_t BUFF_SIZE = 8; - char buff[BUFF_SIZE] = {new_char}; - int result; - while (len > BUFF_SIZE) { - result = file_writer->write(buff, BUFF_SIZE); - if (result < 0) - return result; - len -= BUFF_SIZE; - } - return file_writer->write(buff, len); -} - -// TODO(michaelrj): Move this to putc_unlocked once that is available. -int FileWriter::write_char(void *raw_pointer, char new_char) { - FileWriter *file_writer = reinterpret_cast(raw_pointer); - return file_writer->write(&new_char, 1); -} - -} // namespace printf_core -} // namespace __llvm_libc diff --git a/libc/src/stdio/printf_core/file_writer.h b/libc/src/stdio/printf_core/file_writer.h index 6ba1428a160e2..0fd6d115ddd8b 100644 --- a/libc/src/stdio/printf_core/file_writer.h +++ b/libc/src/stdio/printf_core/file_writer.h @@ -11,6 +11,8 @@ #include "src/__support/CPP/string_view.h" #include "src/__support/File/file.h" +#include "src/__support/macros/attributes.h" // For LIBC_INLINE +#include "src/stdio/printf_core/core_structs.h" #include #include @@ -18,26 +20,81 @@ namespace __llvm_libc { namespace printf_core { -class FileWriter { - __llvm_libc::File *file; +template class FileWriter { + file_t *file; public: - FileWriter(::FILE *init_file) { - file = reinterpret_cast<__llvm_libc::File *>(init_file); - file->lock(); - } + LIBC_INLINE FileWriter(file_t *init_file); - ~FileWriter() { file->unlock(); } + LIBC_INLINE ~FileWriter(); - int write(const char *__restrict to_write, size_t len); + LIBC_INLINE int write(const char *__restrict to_write, size_t len); // These write functions take a FileWriter as a void* in raw_pointer, and // call the appropriate write function on it. 
- static int write_str(void *raw_pointer, cpp::string_view new_string); - static int write_chars(void *raw_pointer, char new_char, size_t len); - static int write_char(void *raw_pointer, char new_char); + static int write_str(void *raw_pointer, cpp::string_view new_string) { + FileWriter *file_writer = reinterpret_cast(raw_pointer); + return file_writer->write(new_string.data(), new_string.size()); + } + static int write_chars(void *raw_pointer, char new_char, size_t len) { + FileWriter *file_writer = reinterpret_cast(raw_pointer); + constexpr size_t BUFF_SIZE = 8; + char buff[BUFF_SIZE] = {new_char}; + int result; + while (len > BUFF_SIZE) { + result = file_writer->write(buff, BUFF_SIZE); + if (result < 0) + return result; + len -= BUFF_SIZE; + } + return file_writer->write(buff, len); + } + static int write_char(void *raw_pointer, char new_char) { + FileWriter *file_writer = reinterpret_cast(raw_pointer); + return file_writer->write(&new_char, 1); + } }; +// The interface for using our internal file implementation. +template <> +LIBC_INLINE +FileWriter<__llvm_libc::File>::FileWriter(__llvm_libc::File *init_file) { + file = init_file; + file->lock(); +} +template <> LIBC_INLINE FileWriter<__llvm_libc::File>::~FileWriter() { + file->unlock(); +} +template <> +LIBC_INLINE int +FileWriter<__llvm_libc::File>::write(const char *__restrict to_write, + size_t len) { + auto result = file->write_unlocked(to_write, len); + size_t written = result.value; + if (written != len || result.has_error()) + written = FILE_WRITE_ERROR; + if (file->error_unlocked()) + written = FILE_STATUS_ERROR; + return written; +} + +// The interface for using the system's file implementation. +template <> LIBC_INLINE FileWriter<::FILE>::FileWriter(::FILE *init_file) { + file = init_file; + ::flockfile(file); +} +template <> LIBC_INLINE FileWriter<::FILE>::~FileWriter() { + ::funlockfile(file); +} +template <> +LIBC_INLINE int FileWriter<::FILE>::write(const char *__restrict to_write, + size_t len) { + size_t written = ::fwrite_unlocked(to_write, 1, len, file); + if (written != len || ::ferror_unlocked(file)) + written = FILE_WRITE_ERROR; + return written; +} + } // namespace printf_core } // namespace __llvm_libc diff --git a/libc/src/stdio/printf_core/vfprintf_internal.cpp b/libc/src/stdio/printf_core/vfprintf_internal.cpp deleted file mode 100644 index b25d545e54a11..0000000000000 --- a/libc/src/stdio/printf_core/vfprintf_internal.cpp +++ /dev/null @@ -1,32 +0,0 @@ -//===-- Internal implementation of vfprintf ---------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/stdio/printf_core/vfprintf_internal.h" - -#include "src/__support/arg_list.h" -#include "src/stdio/printf_core/file_writer.h" -#include "src/stdio/printf_core/printf_main.h" -#include "src/stdio/printf_core/writer.h" - -#include - -namespace __llvm_libc { -namespace printf_core { - -int vfprintf_internal(::FILE *__restrict stream, const char *__restrict format, - internal::ArgList &args) { - FileWriter file_writer(stream); - printf_core::Writer writer(reinterpret_cast(&file_writer), - printf_core::FileWriter::write_str, - printf_core::FileWriter::write_chars, - printf_core::FileWriter::write_char); - return printf_core::printf_main(&writer, format, args); -} - -} // namespace printf_core -} // namespace __llvm_libc diff --git a/libc/src/stdio/printf_core/vfprintf_internal.h b/libc/src/stdio/printf_core/vfprintf_internal.h index b837ebba182b4..762018f0b04c4 100644 --- a/libc/src/stdio/printf_core/vfprintf_internal.h +++ b/libc/src/stdio/printf_core/vfprintf_internal.h @@ -9,15 +9,29 @@ #ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_VFPRINTF_INTERNAL_H #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_VFPRINTF_INTERNAL_H +#include "src/__support/File/file.h" #include "src/__support/arg_list.h" +#include "src/__support/macros/attributes.h" // For LIBC_INLINE +#include "src/stdio/printf_core/file_writer.h" +#include "src/stdio/printf_core/printf_main.h" +#include "src/stdio/printf_core/writer.h" #include namespace __llvm_libc { namespace printf_core { -int vfprintf_internal(::FILE *__restrict stream, const char *__restrict format, - internal::ArgList &args); +template +LIBC_INLINE int vfprintf_internal(file_t *__restrict stream, + const char *__restrict format, + internal::ArgList &args) { + FileWriter file_writer(stream); + Writer writer(reinterpret_cast(&file_writer), + FileWriter::write_str, FileWriter::write_chars, + FileWriter::write_char); + return printf_main(&writer, format, args); +} + } // namespace printf_core } // namespace __llvm_libc diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt index 8747f18f9045b..a4b5a9be892f1 100644 --- a/libc/test/src/stdio/CMakeLists.txt +++ b/libc/test/src/stdio/CMakeLists.txt @@ -134,6 +134,8 @@ add_libc_unittest( libc.src.stdio.snprintf ) +# In fullbuild mode, fprintf's tests use the internal FILE for other functions. +if(LLVM_LIBC_FULL_BUILD) add_libc_unittest( fprintf_test SUITE @@ -147,7 +149,20 @@ add_libc_unittest( libc.src.stdio.fopen libc.src.stdio.fread ) - +else() +# Else in overlay mode they use the system's FILE. 
+add_libc_unittest( + fprintf_test + SUITE + libc_stdio_unittests + SRCS + fprintf_test.cpp + DEPENDS + libc.src.stdio.fprintf + COMPILE_OPTIONS + -DLIBC_COPT_PRINTF_USE_SYSTEM_FILE +) +endif() add_libc_unittest( printf_test diff --git a/libc/test/src/stdio/fprintf_test.cpp b/libc/test/src/stdio/fprintf_test.cpp index 286c516fbcf96..20b3c0faed6f7 100644 --- a/libc/test/src/stdio/fprintf_test.cpp +++ b/libc/test/src/stdio/fprintf_test.cpp @@ -6,10 +6,12 @@ // //===----------------------------------------------------------------------===// +#ifndef LIBC_COPT_PRINTF_USE_SYSTEM_FILE #include "src/stdio/fclose.h" #include "src/stdio/ferror.h" #include "src/stdio/fopen.h" #include "src/stdio/fread.h" +#endif // LIBC_COPT_PRINTF_USE_SYSTEM_FILE #include "src/stdio/fprintf.h" @@ -17,9 +19,23 @@ #include +namespace printf_test { +#ifndef LIBC_COPT_PRINTF_USE_SYSTEM_FILE +using __llvm_libc::fclose; +using __llvm_libc::ferror; +using __llvm_libc::fopen; +using __llvm_libc::fread; +#else // defined(LIBC_COPT_PRINTF_USE_SYSTEM_FILE) +using ::fclose; +using ::ferror; +using ::fopen; +using ::fread; +#endif // LIBC_COPT_PRINTF_USE_SYSTEM_FILE +} // namespace printf_test + TEST(LlvmLibcFPrintfTest, WriteToFile) { constexpr char FILENAME[] = "testdata/fprintf_output.test"; - ::FILE *file = __llvm_libc::fopen(FILENAME, "w"); + ::FILE *file = printf_test::fopen(FILENAME, "w"); ASSERT_FALSE(file == nullptr); int written; @@ -37,31 +53,31 @@ TEST(LlvmLibcFPrintfTest, WriteToFile) { written = __llvm_libc::fprintf(file, format_more, short_numbers); EXPECT_EQ(written, 14); - ASSERT_EQ(0, __llvm_libc::fclose(file)); + ASSERT_EQ(0, printf_test::fclose(file)); - file = __llvm_libc::fopen(FILENAME, "r"); + file = printf_test::fopen(FILENAME, "r"); ASSERT_FALSE(file == nullptr); char data[50]; - ASSERT_EQ(__llvm_libc::fread(data, 1, sizeof(simple) - 1, file), + ASSERT_EQ(printf_test::fread(data, 1, sizeof(simple) - 1, file), sizeof(simple) - 1); data[sizeof(simple) - 1] = '\0'; ASSERT_STREQ(data, simple); - ASSERT_EQ(__llvm_libc::fread(data, 1, sizeof(numbers) - 1, file), + ASSERT_EQ(printf_test::fread(data, 1, sizeof(numbers) - 1, file), sizeof(numbers) - 1); data[sizeof(numbers) - 1] = '\0'; ASSERT_STREQ(data, numbers); - ASSERT_EQ(__llvm_libc::fread( + ASSERT_EQ(printf_test::fread( data, 1, sizeof(format_more) + sizeof(short_numbers) - 4, file), sizeof(format_more) + sizeof(short_numbers) - 4); data[sizeof(format_more) + sizeof(short_numbers) - 4] = '\0'; ASSERT_STREQ(data, "1234 and more\n"); - ASSERT_EQ(__llvm_libc::ferror(file), 0); + ASSERT_EQ(printf_test::ferror(file), 0); written = __llvm_libc::fprintf(file, "Writing to a read only file should fail."); EXPECT_LT(written, 0); - ASSERT_EQ(__llvm_libc::fclose(file), 0); + ASSERT_EQ(printf_test::fclose(file), 0); } From 7d11a592c5adc286bf1845c20b20965d5e999039 Mon Sep 17 00:00:00 2001 From: Alex Brachet Date: Thu, 23 Mar 2023 17:07:19 +0000 Subject: [PATCH 151/208] [libc] Fix some math conversion warnings Differential Revision: https://reviews.llvm.org/D146738 --- libc/src/__support/FPUtil/ManipulationFunctions.h | 2 +- .../__support/FPUtil/NearestIntegerOperations.h | 6 +++--- libc/src/math/generic/acosf.cpp | 8 ++++---- libc/src/math/generic/acoshf.cpp | 3 ++- libc/src/math/generic/asinf.cpp | 4 ++-- libc/src/math/generic/asinhf.cpp | 14 ++++++++------ libc/src/math/generic/atanf.cpp | 4 ++-- libc/src/math/generic/atanhf.cpp | 9 +++++---- libc/src/math/generic/cosf.cpp | 4 ++-- libc/src/math/generic/coshf.cpp | 2 +- libc/src/math/generic/exp10f.cpp | 4 ++-- 
libc/src/math/generic/exp2f.cpp | 2 +- libc/src/math/generic/log10.cpp | 2 +- libc/src/math/generic/log1pf.cpp | 2 +- libc/src/math/generic/sincosf.cpp | 8 ++++---- libc/src/math/generic/sinf.cpp | 6 +++--- libc/src/math/generic/sinhf.cpp | 7 ++++--- libc/src/math/generic/tanf.cpp | 7 ++++--- libc/src/math/generic/tanhf.cpp | 8 ++++---- 19 files changed, 54 insertions(+), 48 deletions(-) diff --git a/libc/src/__support/FPUtil/ManipulationFunctions.h b/libc/src/__support/FPUtil/ManipulationFunctions.h index 27d91c433ac5b..14055ab74dce0 100644 --- a/libc/src/__support/FPUtil/ManipulationFunctions.h +++ b/libc/src/__support/FPUtil/ManipulationFunctions.h @@ -112,7 +112,7 @@ LIBC_INLINE T logb(T x) { } NormalFloat normal(bits); - return normal.exponent; + return static_cast(normal.exponent); } template , int> = 0> diff --git a/libc/src/__support/FPUtil/NearestIntegerOperations.h b/libc/src/__support/FPUtil/NearestIntegerOperations.h index 8265ea1cbb3e5..06aa9484c3f70 100644 --- a/libc/src/__support/FPUtil/NearestIntegerOperations.h +++ b/libc/src/__support/FPUtil/NearestIntegerOperations.h @@ -261,9 +261,9 @@ LIBC_INLINE I rounded_float_to_signed_integer(F x) { } // For all other cases, if `x` can fit in the integer type `I`, - // we just return `x`. Implicit conversion will convert the - // floating point value to the exact integer value. - return x; + // we just return `x`. static_cast will convert the floating + // point value to the exact integer value. + return static_cast(x); } } // namespace internal diff --git a/libc/src/math/generic/acosf.cpp b/libc/src/math/generic/acosf.cpp index 5835dfa617056..41152e06ff1f5 100644 --- a/libc/src/math/generic/acosf.cpp +++ b/libc/src/math/generic/acosf.cpp @@ -56,8 +56,8 @@ LLVM_LIBC_FUNCTION(float, acosf, (float x)) { return r.value(); double xd = static_cast(x); - return fputil::multiply_add(-0x1.5555555555555p-3 * xd, xd * xd, - M_MATH_PI_2 - xd); + return static_cast(fputil::multiply_add( + -0x1.5555555555555p-3 * xd, xd * xd, M_MATH_PI_2 - xd)); } // For |x| <= 0.5, we approximate acosf(x) by: @@ -70,7 +70,7 @@ LLVM_LIBC_FUNCTION(float, acosf, (float x)) { double xsq = xd * xd; double x3 = xd * xsq; double r = asin_eval(xsq); - return fputil::multiply_add(-x3, r, M_MATH_PI_2 - xd); + return static_cast(fputil::multiply_add(-x3, r, M_MATH_PI_2 - xd)); } // |x| > 1, return NaNs. @@ -111,7 +111,7 @@ LLVM_LIBC_FUNCTION(float, acosf, (float x)) { double r3 = asin_eval(u); double r = fputil::multiply_add(cv * u, r3, cv); - return x_sign ? M_MATH_PI - r : r; + return static_cast(x_sign ? 
M_MATH_PI - r : r); } } // namespace __llvm_libc diff --git a/libc/src/math/generic/acoshf.cpp b/libc/src/math/generic/acoshf.cpp index ac225fe5a808f..f8e5a90a4d056 100644 --- a/libc/src/math/generic/acoshf.cpp +++ b/libc/src/math/generic/acoshf.cpp @@ -68,7 +68,8 @@ LLVM_LIBC_FUNCTION(float, acoshf, (float x)) { double x_d = static_cast(x); // acosh(x) = log(x + sqrt(x^2 - 1)) - return log_eval(x_d + fputil::sqrt(fputil::multiply_add(x_d, x_d, -1.0))); + return static_cast( + log_eval(x_d + fputil::sqrt(fputil::multiply_add(x_d, x_d, -1.0)))); } } // namespace __llvm_libc diff --git a/libc/src/math/generic/asinf.cpp b/libc/src/math/generic/asinf.cpp index c24697cb14727..9b724d3296c84 100644 --- a/libc/src/math/generic/asinf.cpp +++ b/libc/src/math/generic/asinf.cpp @@ -99,7 +99,7 @@ LLVM_LIBC_FUNCTION(float, asinf, (float x)) { double xsq = xd * xd; double x3 = xd * xsq; double r = asin_eval(xsq); - return fputil::multiply_add(x3, r, xd); + return static_cast(fputil::multiply_add(x3, r, xd)); } // |x| > 1, return NaNs. @@ -149,7 +149,7 @@ LLVM_LIBC_FUNCTION(float, asinf, (float x)) { double c3 = c1 * u; double r = asin_eval(u); - return fputil::multiply_add(c3, r, c2); + return static_cast(fputil::multiply_add(c3, r, c2)); } } // namespace __llvm_libc diff --git a/libc/src/math/generic/asinhf.cpp b/libc/src/math/generic/asinhf.cpp index 7063387313ab8..91ecf45667bfc 100644 --- a/libc/src/math/generic/asinhf.cpp +++ b/libc/src/math/generic/asinhf.cpp @@ -27,8 +27,9 @@ LLVM_LIBC_FUNCTION(float, asinhf, (float x)) { if (LIBC_UNLIKELY(x_abs <= 0x3e80'0000U)) { // |x| <= 2^-26 if (LIBC_UNLIKELY(x_abs <= 0x3280'0000U)) { - return LIBC_UNLIKELY(x_abs == 0) ? x - : (x - 0x1.5555555555555p-3 * x * x * x); + return static_cast(LIBC_UNLIKELY(x_abs == 0) + ? x + : (x - 0x1.5555555555555p-3 * x * x * x)); } double x_d = x; @@ -40,7 +41,7 @@ LLVM_LIBC_FUNCTION(float, asinhf, (float x)) { x_sq, 0.0, -0x1.555555555551ep-3, 0x1.3333333325495p-4, -0x1.6db6db5a7622bp-5, 0x1.f1c70f82928c6p-6, -0x1.6e893934266b7p-6, 0x1.1c0b41d3fbe78p-6, -0x1.c0f47810b3c4fp-7, 0x1.2c8602690143dp-7); - return fputil::multiply_add(x_d, p, x_d); + return static_cast(fputil::multiply_add(x_d, p, x_d)); } const double SIGN[2] = {1.0, -1.0}; @@ -97,9 +98,10 @@ LLVM_LIBC_FUNCTION(float, asinhf, (float x)) { } // asinh(x) = log(x + sqrt(x^2 + 1)) - return x_sign * - log_eval(fputil::multiply_add( - x_d, x_sign, fputil::sqrt(fputil::multiply_add(x_d, x_d, 1.0)))); + return static_cast( + x_sign * + log_eval(fputil::multiply_add( + x_d, x_sign, fputil::sqrt(fputil::multiply_add(x_d, x_d, 1.0))))); } } // namespace __llvm_libc diff --git a/libc/src/math/generic/atanf.cpp b/libc/src/math/generic/atanf.cpp index ff5d4507afa3d..ed7847adb15a1 100644 --- a/libc/src/math/generic/atanf.cpp +++ b/libc/src/math/generic/atanf.cpp @@ -22,7 +22,7 @@ LLVM_LIBC_FUNCTION(float, atanf, (float x)) { if (LIBC_UNLIKELY(xbits.is_inf_or_nan())) { if (xbits.is_inf()) - return opt_barrier(sign ? -M_MATH_PI_2 : M_MATH_PI_2); + return static_cast(opt_barrier(sign ? 
-M_MATH_PI_2 : M_MATH_PI_2)); else return x; } @@ -52,7 +52,7 @@ LLVM_LIBC_FUNCTION(float, atanf, (float x)) { } } - return atan_eval(x); + return static_cast(atan_eval(x)); } } // namespace __llvm_libc diff --git a/libc/src/math/generic/atanhf.cpp b/libc/src/math/generic/atanhf.cpp index b0c92fa8de87d..0a4512f7622da 100644 --- a/libc/src/math/generic/atanhf.cpp +++ b/libc/src/math/generic/atanhf.cpp @@ -40,8 +40,9 @@ LLVM_LIBC_FUNCTION(float, atanhf, (float x)) { if (LIBC_UNLIKELY(x_abs <= 0x3dcc'0000U)) { // |x| <= 2^-26 if (LIBC_UNLIKELY(x_abs <= 0x3280'0000U)) { - return LIBC_UNLIKELY(x_abs == 0) ? x - : (x + 0x1.5555555555555p-2 * x * x * x); + return static_cast(LIBC_UNLIKELY(x_abs == 0) + ? x + : (x + 0x1.5555555555555p-2 * x * x * x)); } double xdbl = x; @@ -50,10 +51,10 @@ LLVM_LIBC_FUNCTION(float, atanhf, (float x)) { double pe = fputil::polyeval(x2, 0.0, 0x1.5555555555555p-2, 0x1.999999999999ap-3, 0x1.2492492492492p-3, 0x1.c71c71c71c71cp-4, 0x1.745d1745d1746p-4); - return fputil::multiply_add(xdbl, pe, xdbl); + return static_cast(fputil::multiply_add(xdbl, pe, xdbl)); } double xdbl = x; - return 0.5 * log_eval((xdbl + 1.0) / (xdbl - 1.0)); + return static_cast(0.5 * log_eval((xdbl + 1.0) / (xdbl - 1.0))); } } // namespace __llvm_libc diff --git a/libc/src/math/generic/cosf.cpp b/libc/src/math/generic/cosf.cpp index ef94804bda60d..2e4ca3c4133ff 100644 --- a/libc/src/math/generic/cosf.cpp +++ b/libc/src/math/generic/cosf.cpp @@ -129,8 +129,8 @@ LLVM_LIBC_FUNCTION(float, cosf, (float x)) { sincosf_eval(xd, x_abs, sin_k, cos_k, sin_y, cosm1_y); - return fputil::multiply_add(sin_y, -sin_k, - fputil::multiply_add(cosm1_y, cos_k, cos_k)); + return static_cast(fputil::multiply_add( + sin_y, -sin_k, fputil::multiply_add(cosm1_y, cos_k, cos_k))); } } // namespace __llvm_libc diff --git a/libc/src/math/generic/coshf.cpp b/libc/src/math/generic/coshf.cpp index 1ce1bc300d46b..1cf789a10a8ca 100644 --- a/libc/src/math/generic/coshf.cpp +++ b/libc/src/math/generic/coshf.cpp @@ -47,7 +47,7 @@ LLVM_LIBC_FUNCTION(float, coshf, (float x)) { // but not too small inputs, such as |x| < 2^-2, or |x| < 2^-3. // cosh(x) = (e^x + e^(-x)) / 2. - return exp_pm_eval(x); + return static_cast(exp_pm_eval(x)); } } // namespace __llvm_libc diff --git a/libc/src/math/generic/exp10f.cpp b/libc/src/math/generic/exp10f.cpp index 06509a97fe032..9d07f2c5261ee 100644 --- a/libc/src/math/generic/exp10f.cpp +++ b/libc/src/math/generic/exp10f.cpp @@ -72,7 +72,7 @@ LLVM_LIBC_FUNCTION(float, exp10f, (float x)) { return fputil::multiply_add(x, 0x1.26bb1cp+1f, 1.0f); } - return Exp10Base::powb_lo(x); + return static_cast(Exp10Base::powb_lo(x)); } // Exceptional value. 
@@ -129,7 +129,7 @@ LLVM_LIBC_FUNCTION(float, exp10f, (float x)) { // 10^x = 2^(mid + hi) * 10^lo // ~ mh * (c0 + p * lo^2) // = (mh * c0) + p * (mh * lo^2) - return multiply_add(p, lo2 * rr.mh, c0 * rr.mh); + return static_cast(multiply_add(p, lo2 * rr.mh, c0 * rr.mh)); } } // namespace __llvm_libc diff --git a/libc/src/math/generic/exp2f.cpp b/libc/src/math/generic/exp2f.cpp index 3c319d288e1c9..15f35d0da82f5 100644 --- a/libc/src/math/generic/exp2f.cpp +++ b/libc/src/math/generic/exp2f.cpp @@ -128,7 +128,7 @@ LLVM_LIBC_FUNCTION(float, exp2f, (float x)) { // = 2^(hi + mid) * 2^lo // ~ mh * (1 + lo * P(lo)) // = mh + (mh*lo) * P(lo) - return fputil::multiply_add(p, dx_sq * mh, c1 * mh); + return static_cast(fputil::multiply_add(p, dx_sq * mh, c1 * mh)); } } // namespace __llvm_libc diff --git a/libc/src/math/generic/log10.cpp b/libc/src/math/generic/log10.cpp index b2dd29f5f74d1..47569b4758a4b 100644 --- a/libc/src/math/generic/log10.cpp +++ b/libc/src/math/generic/log10.cpp @@ -978,7 +978,7 @@ LLVM_LIBC_FUNCTION(double, log10, (double x)) { // |R * x_m - 1| < C uint64_t x_u = xbits.uintval(); int shifted = x_u >> 45; - size_t index = shifted & 0x7F; + int index = shifted & 0x7F; double r = R[index]; x_e += (x_u >> 52) & 0x7FF; diff --git a/libc/src/math/generic/log1pf.cpp b/libc/src/math/generic/log1pf.cpp index bf6a91a5fc466..a7ca54887d59a 100644 --- a/libc/src/math/generic/log1pf.cpp +++ b/libc/src/math/generic/log1pf.cpp @@ -150,7 +150,7 @@ LLVM_LIBC_FUNCTION(float, log1pf, (float x)) { double c2 = fputil::multiply_add(xd, COEFFS[5], COEFFS[4]); double r = fputil::polyeval(xsq, xd, c0, c1, c2, COEFFS[6]); - return r; + return static_cast(r); } } // namespace __llvm_libc diff --git a/libc/src/math/generic/sincosf.cpp b/libc/src/math/generic/sincosf.cpp index 8448945a71d5d..277126bdc89db 100644 --- a/libc/src/math/generic/sincosf.cpp +++ b/libc/src/math/generic/sincosf.cpp @@ -193,10 +193,10 @@ LLVM_LIBC_FUNCTION(void, sincosf, (float x, float *sinp, float *cosp)) { sincosf_eval(xd, x_abs, sin_k, cos_k, sin_y, cosm1_y); - *sinp = fputil::multiply_add(sin_y, cos_k, - fputil::multiply_add(cosm1_y, sin_k, sin_k)); - *cosp = fputil::multiply_add(sin_y, -sin_k, - fputil::multiply_add(cosm1_y, cos_k, cos_k)); + *sinp = static_cast(fputil::multiply_add( + sin_y, cos_k, fputil::multiply_add(cosm1_y, sin_k, sin_k))); + *cosp = static_cast(fputil::multiply_add( + sin_y, -sin_k, fputil::multiply_add(cosm1_y, cos_k, cos_k))); } } // namespace __llvm_libc diff --git a/libc/src/math/generic/sinf.cpp b/libc/src/math/generic/sinf.cpp index 1641c44e9fc00..697c438c2c67b 100644 --- a/libc/src/math/generic/sinf.cpp +++ b/libc/src/math/generic/sinf.cpp @@ -121,7 +121,7 @@ LLVM_LIBC_FUNCTION(float, sinf, (float x)) { double result = fputil::polyeval(xsq, 1.0, -0x1.55555555554c6p-3, 0x1.1111111085e65p-7, -0x1.a019f70fb4d4fp-13, 0x1.718d179815e74p-19); - return xd * result; + return static_cast(xd * result); } if (LIBC_UNLIKELY(x_abs == 0x4619'9998U)) { // x = 0x1.33333p13 @@ -150,8 +150,8 @@ LLVM_LIBC_FUNCTION(float, sinf, (float x)) { sincosf_eval(xd, x_abs, sin_k, cos_k, sin_y, cosm1_y); - return fputil::multiply_add(sin_y, cos_k, - fputil::multiply_add(cosm1_y, sin_k, sin_k)); + return static_cast(fputil::multiply_add( + sin_y, cos_k, fputil::multiply_add(cosm1_y, sin_k, sin_k))); } } // namespace __llvm_libc diff --git a/libc/src/math/generic/sinhf.cpp b/libc/src/math/generic/sinhf.cpp index cc3811deeb9de..92edd4ea6a98f 100644 --- a/libc/src/math/generic/sinhf.cpp +++ b/libc/src/math/generic/sinhf.cpp @@ 
-21,7 +21,8 @@ LLVM_LIBC_FUNCTION(float, sinhf, (float x)) { // |x| <= 2^-26 if (LIBC_UNLIKELY(x_abs <= 0x3280'0000U)) { - return LIBC_UNLIKELY(x_abs == 0) ? x : (x + 0.25 * x * x * x); + return static_cast( + LIBC_UNLIKELY(x_abs == 0) ? x : (x + 0.25 * x * x * x)); } // When |x| >= 90, or x is inf or nan @@ -65,11 +66,11 @@ LLVM_LIBC_FUNCTION(float, sinhf, (float x)) { // Therefore, output of Sollya = x * pe; double pe = fputil::polyeval(x2, 0.0, 0x1.5555555556583p-3, 0x1.111110d239f1fp-7, 0x1.a02b5a284013cp-13); - return fputil::multiply_add(xdbl, pe, xdbl); + return static_cast(fputil::multiply_add(xdbl, pe, xdbl)); } // sinh(x) = (e^x - e^(-x)) / 2. - return exp_pm_eval(x); + return static_cast(exp_pm_eval(x)); } } // namespace __llvm_libc diff --git a/libc/src/math/generic/tanf.cpp b/libc/src/math/generic/tanf.cpp index 217664f8b2acb..681f16177fde5 100644 --- a/libc/src/math/generic/tanf.cpp +++ b/libc/src/math/generic/tanf.cpp @@ -90,7 +90,7 @@ LLVM_LIBC_FUNCTION(float, tanf, (float x)) { double result = fputil::polyeval(xsq, 1.0, 0x1.555555553d022p-2, 0x1.111111ce442c1p-3, 0x1.ba180a6bbdecdp-5, 0x1.69c0a88a0b71fp-6); - return xd * result; + return static_cast(xd * result); } // Check for exceptional values @@ -134,8 +134,9 @@ LLVM_LIBC_FUNCTION(float, tanf, (float x)) { // tan(x) = sin(x) / cos(x) // = (sin_y * cos_k + cos_y * sin_k) / (cos_y * cos_k - sin_y * sin_k) using fputil::multiply_add; - return multiply_add(sin_y, cos_k, multiply_add(cosm1_y, sin_k, sin_k)) / - multiply_add(sin_y, -sin_k, multiply_add(cosm1_y, cos_k, cos_k)); + return static_cast( + multiply_add(sin_y, cos_k, multiply_add(cosm1_y, sin_k, sin_k)) / + multiply_add(sin_y, -sin_k, multiply_add(cosm1_y, cos_k, cos_k))); } } // namespace __llvm_libc diff --git a/libc/src/math/generic/tanhf.cpp b/libc/src/math/generic/tanhf.cpp index e67eadf3ce4b7..3b8506f809c3b 100644 --- a/libc/src/math/generic/tanhf.cpp +++ b/libc/src/math/generic/tanhf.cpp @@ -22,8 +22,8 @@ LLVM_LIBC_FUNCTION(float, tanhf, (float x)) { // |x| <= 2^-26 if (LIBC_UNLIKELY(x_abs <= 0x3280'0000U)) { - return LIBC_UNLIKELY(x_abs == 0) ? x - : (x - 0x1.5555555555555p-2 * x * x * x); + return static_cast( + LIBC_UNLIKELY(x_abs == 0) ? 
x : (x - 0x1.5555555555555p-2 * x * x * x)); } // When |x| >= 15, or x is inf or nan @@ -48,7 +48,7 @@ LLVM_LIBC_FUNCTION(float, tanhf, (float x)) { double pe = fputil::polyeval(x2, 0.0, -0x1.5555555555555p-2, 0x1.1111111111111p-3, -0x1.ba1ba1ba1ba1cp-5, 0x1.664f4882c10fap-6, -0x1.226e355e6c23dp-7); - return fputil::multiply_add(xdbl, pe, xdbl); + return static_cast(fputil::multiply_add(xdbl, pe, xdbl)); } if (LIBC_UNLIKELY(xbits.bits == 0x4058'e0a3U)) { @@ -65,7 +65,7 @@ LLVM_LIBC_FUNCTION(float, tanhf, (float x)) { fputil::multiply_add(ep.mh, r, 1.0); #else double exp_x = ep.mh * r; - return (exp_x - 1.0) / (exp_x + 1.0); + return static_cast((exp_x - 1.0) / (exp_x + 1.0)); #endif // LIBC_TARGET_CPU_HAS_FMA } From 5f883cdbfbe216ec184194114676075f3633e08b Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 23 Mar 2023 10:19:10 -0700 Subject: [PATCH 152/208] [docs] Document -fomit-frame-pointer Close #61322 Reviewed By: vitalybuka Differential Revision: https://reviews.llvm.org/D146603 --- clang/include/clang/Driver/Options.td | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index b50dfd6f35510..821e86c0260f3 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2638,7 +2638,11 @@ defm objc_avoid_heapify_local_blocks : BoolFOption<"objc-avoid-heapify-local-blo NegFlag, BothFlags<[CC1Option, NoDriverOption], " to avoid heapifying local blocks">>; -def fomit_frame_pointer : Flag<["-"], "fomit-frame-pointer">, Group; +def fomit_frame_pointer : Flag<["-"], "fomit-frame-pointer">, Group, + HelpText<"Omit the frame pointer from functions that don't need it. " + "Some stack unwinding cases, such as profilers and sanitizers, may prefer specifying -fno-omit-frame-pointer. " + "On many targets, -O1 and higher omit the frame pointer by default. 
" + "-m[no-]omit-leaf-frame-pointer takes precedence for leaf functions">; def fopenmp : Flag<["-"], "fopenmp">, Group, Flags<[CC1Option, NoArgumentUnused, FlangOption, FC1Option]>, HelpText<"Parse OpenMP pragmas and generate parallel code.">; def fno_openmp : Flag<["-"], "fno-openmp">, Group, Flags<[NoArgumentUnused]>; From 40e5d212cffd2b87f688dd441cd7c7f4084d407d Mon Sep 17 00:00:00 2001 From: Viktoriia Bakalova Date: Thu, 23 Mar 2023 17:27:10 +0000 Subject: [PATCH 153/208] [clangd] Fix indentation in HoverTests.cpp --- .../clangd/unittests/HoverTests.cpp | 66 +++++++++---------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/clang-tools-extra/clangd/unittests/HoverTests.cpp b/clang-tools-extra/clangd/unittests/HoverTests.cpp index 6ee9384204036..728f5444014dc 100644 --- a/clang-tools-extra/clangd/unittests/HoverTests.cpp +++ b/clang-tools-extra/clangd/unittests/HoverTests.cpp @@ -2892,50 +2892,50 @@ TEST(Hover, Providers) { const char *Code; const std::function ExpectedBuilder; } Cases[] = {{R"cpp( - struct Foo {}; - Foo F = Fo^o{}; - )cpp", + struct Foo {}; + Foo F = Fo^o{}; + )cpp", [](HoverInfo &HI) { HI.Provider = ""; }}, {R"cpp( - #include "foo.h" - Foo F = Fo^o{}; - )cpp", + #include "foo.h" + Foo F = Fo^o{}; + )cpp", [](HoverInfo &HI) { HI.Provider = "\"foo.h\""; }}, {R"cpp( - #include "all.h" - Foo F = Fo^o{}; - )cpp", + #include "all.h" + Foo F = Fo^o{}; + )cpp", [](HoverInfo &HI) { HI.Provider = "\"foo.h\""; }}, {R"cpp( - #define FOO 5 - int F = ^FOO; - )cpp", + #define FOO 5 + int F = ^FOO; + )cpp", [](HoverInfo &HI) { HI.Provider = ""; }}, {R"cpp( - #include "foo.h" - int F = ^FOO; - )cpp", + #include "foo.h" + int F = ^FOO; + )cpp", [](HoverInfo &HI) { HI.Provider = "\"foo.h\""; }}, {R"cpp( - #include "all.h" - int F = ^FOO; - )cpp", + #include "all.h" + int F = ^FOO; + )cpp", [](HoverInfo &HI) { HI.Provider = "\"foo.h\""; }}, {R"cpp( - #include "foo.h" - Foo A; - Foo B; - Foo C = A ^+ B; - )cpp", + #include "foo.h" + Foo A; + Foo B; + Foo C = A ^+ B; + )cpp", [](HoverInfo &HI) { HI.Provider = "\"foo.h\""; }}, // Hover selects the underlying decl of the using decl {R"cpp( - #include "foo.h" - namespace ns { - using ::Foo; - } - ns::F^oo d; - )cpp", + #include "foo.h" + namespace ns { + using ::Foo; + } + ns::F^oo d; + )cpp", [](HoverInfo &HI) { HI.Provider = "\"foo.h\""; }}}; for (const auto &Case : Cases) { @@ -2946,10 +2946,10 @@ TEST(Hover, Providers) { TU.Filename = "foo.cpp"; TU.Code = Code.code(); TU.AdditionalFiles["foo.h"] = guard(R"cpp( - #define FOO 1 - class Foo {}; - Foo& operator+(const Foo, const Foo); - )cpp"); + #define FOO 1 + class Foo {}; + Foo& operator+(const Foo, const Foo); + )cpp"); TU.AdditionalFiles["all.h"] = guard("#include \"foo.h\""); auto AST = TU.build(); From 40aaa272f145e633b29d5e70a4590cc425801f7e Mon Sep 17 00:00:00 2001 From: Hristo Hristov Date: Wed, 22 Mar 2023 23:24:22 +0200 Subject: [PATCH 154/208] [libc++][ranges] P2711R1 Making multi-param constructors of views explicit Implemented [[ https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p2711r1.html | P2711R1 ]] for existing views. 
(`join_with_view` is not yet implemented) Reviewed By: #libc, philnik Differential Revision: https://reviews.llvm.org/D144822 --- libcxx/docs/Status/Cxx2b.rst | 1 + libcxx/docs/Status/Cxx2bPapers.csv | 2 +- libcxx/include/__config | 6 +++ libcxx/include/__ranges/drop_view.h | 2 +- libcxx/include/__ranges/drop_while_view.h | 2 +- libcxx/include/__ranges/filter_view.h | 6 +-- libcxx/include/__ranges/iota_view.h | 8 ++-- libcxx/include/__ranges/lazy_split_view.h | 4 +- libcxx/include/__ranges/split_view.h | 5 ++- libcxx/include/__ranges/take_view.h | 3 +- libcxx/include/__ranges/take_while_view.h | 2 +- libcxx/include/__ranges/transform_view.h | 2 +- .../range.drop.while/ctor.view.pass.cpp | 21 ++++++++- .../range.drop/ctor.view.pass.cpp | 17 +++++++- .../ranges/range.adaptors/range.drop/types.h | 11 +++++ .../range.filter/ctor.view_pred.pass.cpp | 35 ++++++++------- .../range.lazy.split/ctor.range.pass.cpp | 20 ++++++++- .../range.lazy.split/ctor.view.pass.cpp | 28 +++++++++--- .../range.split/ctor.range.pass.cpp | 21 ++++++++- .../range.split/ctor.view.pass.cpp | 25 +++++++++-- .../range.take.while/ctor.view.pass.cpp | 21 ++++++++- .../range.take/ctor.view_count.pass.cpp | 21 +++++++-- .../ranges/range.adaptors/range.take/types.h | 11 +++++ .../ctor.view_function.pass.cpp | 32 ++++++++------ .../range.iota.view/ctor.first.last.pass.cpp | 43 ++++++++++++++++++- .../range.iota.view/ctor.value.bound.pass.cpp | 41 +++++++++++++++++- 26 files changed, 321 insertions(+), 69 deletions(-) diff --git a/libcxx/docs/Status/Cxx2b.rst b/libcxx/docs/Status/Cxx2b.rst index 471b992fdc03b..3fbbb10f3e30b 100644 --- a/libcxx/docs/Status/Cxx2b.rst +++ b/libcxx/docs/Status/Cxx2b.rst @@ -43,6 +43,7 @@ Paper Status .. [#note-P1413R3] P1413R3: ``std::aligned_storage_t`` and ``std::aligned_union_t`` are marked deprecated, but clang doesn't issue a diagnostic for deprecated using template declarations. .. [#note-P2520R0] P2520R0: Libc++ implemented this paper as a DR in C++20 as well. + .. [#note-P2711R1] P2711R1: ``join_with_view`` hasn't been done yet since this type isn't implemented yet. .. 
_issues-status-cxx2b: diff --git a/libcxx/docs/Status/Cxx2bPapers.csv b/libcxx/docs/Status/Cxx2bPapers.csv index e51ee27deb3c1..900130cfdd506 100644 --- a/libcxx/docs/Status/Cxx2bPapers.csv +++ b/libcxx/docs/Status/Cxx2bPapers.csv @@ -108,7 +108,7 @@ "`P0290R4 `__","LWG", "``apply()`` for ``synchronized_value``","February 2023","","","|concurrency TS|" "`P2770R0 `__","LWG", "Stashing stashing ``iterators`` for proper flattening","February 2023","","","|ranges|" "`P2164R9 `__","LWG", "``views::enumerate``","February 2023","","","|ranges|" -"`P2711R1 `__","LWG", "Making multi-param constructors of ``views`` ``explicit``","February 2023","","","|ranges|" +"`P2711R1 `__","LWG", "Making multi-param constructors of ``views`` ``explicit``","February 2023","|Partial| [#note-P2711R1]_","","|ranges|" "`P2609R3 `__","LWG", "Relaxing Ranges Just A Smidge","February 2023","","","|ranges|" "`P2713R1 `__","LWG", "Escaping improvements in ``std::format``","February 2023","","","|format|" "`P2675R1 `__","LWG", "``format``'s width estimation is too approximate and not forward compatible","February 2023","","","|format|" diff --git a/libcxx/include/__config b/libcxx/include/__config index 3d3664eb83ae7..b9076073ab250 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -847,6 +847,12 @@ _LIBCPP_BEGIN_NAMESPACE_STD _LIBCPP_END_NAMESPACE_STD # define _LIBCPP_EXPLICIT_SINCE_CXX14 explicit # endif +# if _LIBCPP_STD_VER >= 23 +# define _LIBCPP_EXPLICIT_SINCE_CXX23 explicit +# else +# define _LIBCPP_EXPLICIT_SINCE_CXX23 +# endif + # if _LIBCPP_STD_VER >= 14 # define _LIBCPP_CONSTEXPR_SINCE_CXX14 constexpr # else diff --git a/libcxx/include/__ranges/drop_view.h b/libcxx/include/__ranges/drop_view.h index b97505b578552..87668c341cb67 100644 --- a/libcxx/include/__ranges/drop_view.h +++ b/libcxx/include/__ranges/drop_view.h @@ -74,7 +74,7 @@ namespace ranges { drop_view() requires default_initializable<_View> = default; _LIBCPP_HIDE_FROM_ABI - constexpr drop_view(_View __base, range_difference_t<_View> __count) + constexpr _LIBCPP_EXPLICIT_SINCE_CXX23 drop_view(_View __base, range_difference_t<_View> __count) : __count_(__count) , __base_(std::move(__base)) { diff --git a/libcxx/include/__ranges/drop_while_view.h b/libcxx/include/__ranges/drop_while_view.h index 7c28992f18742..518feae4e2a98 100644 --- a/libcxx/include/__ranges/drop_while_view.h +++ b/libcxx/include/__ranges/drop_while_view.h @@ -51,7 +51,7 @@ class drop_while_view : public view_interface> { requires default_initializable<_View> && default_initializable<_Pred> = default; - _LIBCPP_HIDE_FROM_ABI constexpr drop_while_view(_View __base, _Pred __pred) + _LIBCPP_HIDE_FROM_ABI constexpr _LIBCPP_EXPLICIT_SINCE_CXX23 drop_while_view(_View __base, _Pred __pred) : __base_(std::move(__base)), __pred_(std::in_place, std::move(__pred)) {} _LIBCPP_HIDE_FROM_ABI constexpr _View base() const& diff --git a/libcxx/include/__ranges/filter_view.h b/libcxx/include/__ranges/filter_view.h index bf1481b7f9156..28d08c8a67e3b 100644 --- a/libcxx/include/__ranges/filter_view.h +++ b/libcxx/include/__ranges/filter_view.h @@ -64,10 +64,8 @@ namespace ranges { _LIBCPP_HIDE_FROM_ABI filter_view() requires default_initializable<_View> && default_initializable<_Pred> = default; - _LIBCPP_HIDE_FROM_ABI - constexpr filter_view(_View __base, _Pred __pred) - : __base_(std::move(__base)), __pred_(in_place, std::move(__pred)) - { } + _LIBCPP_HIDE_FROM_ABI constexpr _LIBCPP_EXPLICIT_SINCE_CXX23 filter_view(_View __base, _Pred __pred) + : __base_(std::move(__base)), 
__pred_(in_place, std::move(__pred)) {} template _LIBCPP_HIDE_FROM_ABI diff --git a/libcxx/include/__ranges/iota_view.h b/libcxx/include/__ranges/iota_view.h index 67cf0b73ecd65..57139426724bd 100644 --- a/libcxx/include/__ranges/iota_view.h +++ b/libcxx/include/__ranges/iota_view.h @@ -314,7 +314,7 @@ namespace ranges { constexpr explicit iota_view(_Start __value) : __value_(std::move(__value)) { } _LIBCPP_HIDE_FROM_ABI - constexpr iota_view(type_identity_t<_Start> __value, type_identity_t<_BoundSentinel> __bound_sentinel) + constexpr _LIBCPP_EXPLICIT_SINCE_CXX23 iota_view(type_identity_t<_Start> __value, type_identity_t<_BoundSentinel> __bound_sentinel) : __value_(std::move(__value)), __bound_sentinel_(std::move(__bound_sentinel)) { // Validate the precondition if possible. if constexpr (totally_ordered_with<_Start, _BoundSentinel>) { @@ -324,17 +324,17 @@ namespace ranges { } _LIBCPP_HIDE_FROM_ABI - constexpr iota_view(__iterator __first, __iterator __last) + constexpr _LIBCPP_EXPLICIT_SINCE_CXX23 iota_view(__iterator __first, __iterator __last) requires same_as<_Start, _BoundSentinel> : iota_view(std::move(__first.__value_), std::move(__last.__value_)) {} _LIBCPP_HIDE_FROM_ABI - constexpr iota_view(__iterator __first, _BoundSentinel __last) + constexpr _LIBCPP_EXPLICIT_SINCE_CXX23 iota_view(__iterator __first, _BoundSentinel __last) requires same_as<_BoundSentinel, unreachable_sentinel_t> : iota_view(std::move(__first.__value_), std::move(__last)) {} _LIBCPP_HIDE_FROM_ABI - constexpr iota_view(__iterator __first, __sentinel __last) + constexpr _LIBCPP_EXPLICIT_SINCE_CXX23 iota_view(__iterator __first, __sentinel __last) requires(!same_as<_Start, _BoundSentinel> && !same_as<_Start, unreachable_sentinel_t>) : iota_view(std::move(__first.__value_), std::move(__last.__bound_sentinel_)) {} diff --git a/libcxx/include/__ranges/lazy_split_view.h b/libcxx/include/__ranges/lazy_split_view.h index b5b0e7ef02307..186a0af320f14 100644 --- a/libcxx/include/__ranges/lazy_split_view.h +++ b/libcxx/include/__ranges/lazy_split_view.h @@ -82,14 +82,14 @@ class lazy_split_view : public view_interface> requires default_initializable<_View> && default_initializable<_Pattern> = default; _LIBCPP_HIDE_FROM_ABI - constexpr lazy_split_view(_View __base, _Pattern __pattern) + constexpr _LIBCPP_EXPLICIT_SINCE_CXX23 lazy_split_view(_View __base, _Pattern __pattern) : __base_(std::move(__base)), __pattern_(std::move(__pattern)) {} template requires constructible_from<_View, views::all_t<_Range>> && constructible_from<_Pattern, single_view>> _LIBCPP_HIDE_FROM_ABI - constexpr lazy_split_view(_Range&& __r, range_value_t<_Range> __e) + constexpr _LIBCPP_EXPLICIT_SINCE_CXX23 lazy_split_view(_Range&& __r, range_value_t<_Range> __e) : __base_(views::all(std::forward<_Range>(__r))) , __pattern_(views::single(std::move(__e))) {} diff --git a/libcxx/include/__ranges/split_view.h b/libcxx/include/__ranges/split_view.h index 6ebe5a43ed228..a27ac4ef7a196 100644 --- a/libcxx/include/__ranges/split_view.h +++ b/libcxx/include/__ranges/split_view.h @@ -75,13 +75,14 @@ class split_view : public view_interface> { requires default_initializable<_View> && default_initializable<_Pattern> = default; - _LIBCPP_HIDE_FROM_ABI constexpr split_view(_View __base, _Pattern __pattern) + _LIBCPP_HIDE_FROM_ABI constexpr _LIBCPP_EXPLICIT_SINCE_CXX23 split_view(_View __base, _Pattern __pattern) : __base_(std::move(__base)), __pattern_(std::move((__pattern))) {} template requires constructible_from<_View, views::all_t<_Range>> && 
constructible_from<_Pattern, single_view>> - _LIBCPP_HIDE_FROM_ABI constexpr split_view(_Range&& __range, range_value_t<_Range> __elem) + _LIBCPP_HIDE_FROM_ABI constexpr _LIBCPP_EXPLICIT_SINCE_CXX23 + split_view(_Range&& __range, range_value_t<_Range> __elem) : __base_(views::all(std::forward<_Range>(__range))), __pattern_(views::single(std::move(__elem))) {} _LIBCPP_HIDE_FROM_ABI constexpr _View base() const& diff --git a/libcxx/include/__ranges/take_view.h b/libcxx/include/__ranges/take_view.h index ec859e920ff17..111e7e5ba2516 100644 --- a/libcxx/include/__ranges/take_view.h +++ b/libcxx/include/__ranges/take_view.h @@ -67,7 +67,8 @@ class take_view : public view_interface> { _LIBCPP_HIDE_FROM_ABI take_view() requires default_initializable<_View> = default; - _LIBCPP_HIDE_FROM_ABI constexpr take_view(_View __base, range_difference_t<_View> __count) + _LIBCPP_HIDE_FROM_ABI + constexpr _LIBCPP_EXPLICIT_SINCE_CXX23 take_view(_View __base, range_difference_t<_View> __count) : __base_(std::move(__base)), __count_(__count) { _LIBCPP_ASSERT(__count >= 0, "count has to be greater than or equal to zero"); } diff --git a/libcxx/include/__ranges/take_while_view.h b/libcxx/include/__ranges/take_while_view.h index 77d7390dceb9c..d1f1bfe75411f 100644 --- a/libcxx/include/__ranges/take_while_view.h +++ b/libcxx/include/__ranges/take_while_view.h @@ -67,7 +67,7 @@ class take_while_view : public view_interface> { requires default_initializable<_View> && default_initializable<_Pred> = default; - _LIBCPP_HIDE_FROM_ABI constexpr take_while_view(_View __base, _Pred __pred) + _LIBCPP_HIDE_FROM_ABI constexpr _LIBCPP_EXPLICIT_SINCE_CXX23 take_while_view(_View __base, _Pred __pred) : __base_(std::move(__base)), __pred_(std::in_place, std::move(__pred)) {} _LIBCPP_HIDE_FROM_ABI constexpr _View base() const& diff --git a/libcxx/include/__ranges/transform_view.h b/libcxx/include/__ranges/transform_view.h index a71350f0c99dc..14bd400e6d079 100644 --- a/libcxx/include/__ranges/transform_view.h +++ b/libcxx/include/__ranges/transform_view.h @@ -71,7 +71,7 @@ class transform_view : public view_interface> { requires default_initializable<_View> && default_initializable<_Fn> = default; _LIBCPP_HIDE_FROM_ABI - constexpr transform_view(_View __base, _Fn __func) + constexpr _LIBCPP_EXPLICIT_SINCE_CXX23 transform_view(_View __base, _Fn __func) : __func_(std::in_place, std::move(__func)), __base_(std::move(__base)) {} _LIBCPP_HIDE_FROM_ABI diff --git a/libcxx/test/std/ranges/range.adaptors/range.drop.while/ctor.view.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.drop.while/ctor.view.pass.cpp index cf9f9dbca9a56..326cabd637089 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.drop.while/ctor.view.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.drop.while/ctor.view.pass.cpp @@ -8,7 +8,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 -// constexpr drop_while_view(V base, Pred pred); +// constexpr drop_while_view(V base, Pred pred); // explicit since C++23 #include #include @@ -16,6 +16,8 @@ #include #include "MoveOnly.h" +#include "test_convertible.h" +#include "test_macros.h" struct View : std::ranges::view_base { MoveOnly mo; @@ -32,9 +34,23 @@ struct Pred { bool operator()(int) const; }; +// SFINAE tests. 
+ +#if TEST_STD_VER >= 23 + +static_assert(!test_convertible, View, Pred>(), + "This constructor must be explicit"); + +#else + +static_assert( test_convertible, View, Pred>(), + "This constructor must not be explicit"); + +#endif // TEST_STD_VER >= 23 + constexpr bool test() { { - std::ranges::drop_while_view dwv = {View{{}, MoveOnly{5}}, Pred{}}; + std::ranges::drop_while_view dwv{View{{}, MoveOnly{5}}, Pred{}}; assert(dwv.pred().moved); assert(!dwv.pred().copied); assert(std::move(dwv).base().mo.get() == 5); @@ -45,5 +61,6 @@ constexpr bool test() { int main(int, char**) { test(); static_assert(test()); + return 0; } diff --git a/libcxx/test/std/ranges/range.adaptors/range.drop/ctor.view.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.drop/ctor.view.pass.cpp index 504021aa9cc48..4f4257f9102cb 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.drop/ctor.view.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.drop/ctor.view.pass.cpp @@ -8,13 +8,28 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 -// constexpr drop_view(V base, range_difference_t count); +// constexpr drop_view(V base, range_difference_t count); // explicit since C++23 #include +#include "test_convertible.h" #include "test_macros.h" #include "types.h" +// SFINAE tests. + +#if TEST_STD_VER >= 23 + +static_assert(!test_convertible, View, std::ranges::range_difference_t>(), + "This constructor must be explicit"); + +#else + +static_assert(test_convertible, View, std::ranges::range_difference_t>(), + "This constructor must not be explicit"); + +#endif // TEST_STD_VER >= 23 + constexpr bool test() { std::ranges::drop_view dropView1(MoveOnlyView(), 4); assert(dropView1.size() == 4); diff --git a/libcxx/test/std/ranges/range.adaptors/range.drop/types.h b/libcxx/test/std/ranges/range.adaptors/range.drop/types.h index b32c534a37e43..32bbddc05ed97 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.drop/types.h +++ b/libcxx/test/std/ranges/range.adaptors/range.drop/types.h @@ -94,4 +94,15 @@ struct CountedView : std::ranges::view_base { constexpr CountedIter end() const { return CountedIter(ForwardIter(globalBuff + 8)); } }; +struct View : std::ranges::view_base { + constexpr explicit View(int* b, int* e) : begin_(b), end_(e) { } + + constexpr int* begin() const { return begin_; } + constexpr int* end() const { return end_; } + +private: + int* begin_; + int* end_; +}; + #endif // TEST_STD_RANGES_RANGE_ADAPTORS_RANGE_DROP_TYPES_H diff --git a/libcxx/test/std/ranges/range.adaptors/range.filter/ctor.view_pred.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.filter/ctor.view_pred.pass.cpp index 644f8deba6e85..3ccab93397147 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.filter/ctor.view_pred.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.filter/ctor.view_pred.pass.cpp @@ -8,12 +8,14 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 -// constexpr filter_view(View, Pred); - -#include +// constexpr filter_view(View, Pred); // explicit since C++23 #include +#include #include + +#include "test_convertible.h" +#include "test_macros.h" #include "types.h" struct Range : std::ranges::view_base { @@ -41,6 +43,20 @@ struct TrackingRange : TrackInitialization, std::ranges::view_base { int* end() const; }; +// SFINAE tests. 
+ +#if TEST_STD_VER >= 23 + +static_assert(!test_convertible, Range, Pred>(), + "This constructor must be explicit"); + +#else + +static_assert( test_convertible, Range, Pred>(), + "This constructor must not be explicit"); + +#endif // TEST_STD_VER >= 23 + constexpr bool test() { int buff[] = {1, 2, 3, 4, 5, 6, 7, 8}; @@ -57,19 +73,6 @@ constexpr bool test() { assert(it == end); } - // Test implicit syntax - { - Range range(buff, buff + 8); - Pred pred; - std::ranges::filter_view view = {range, pred}; - auto it = view.begin(), end = view.end(); - assert(*it++ == 1); - assert(*it++ == 3); - assert(*it++ == 5); - assert(*it++ == 7); - assert(it == end); - } - // Make sure we move the view { bool moved = false, copied = false; diff --git a/libcxx/test/std/ranges/range.adaptors/range.lazy.split/ctor.range.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.lazy.split/ctor.range.pass.cpp index ebf1356afff60..91df304b79af7 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.lazy.split/ctor.range.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.lazy.split/ctor.range.pass.cpp @@ -11,7 +11,7 @@ // template // requires constructible_from> && // constructible_from>> -// constexpr lazy_split_view(Range&& r, range_value_t e); +// constexpr lazy_split_view(Range&& r, range_value_t e); // explicit since C++23 #include @@ -20,6 +20,8 @@ #include #include #include + +#include "test_convertible.h" #include "types.h" struct ElementWithCounting { @@ -88,6 +90,22 @@ static_assert( std::ranges::random_access_range); static_assert( std::ranges::view); static_assert( std::is_copy_constructible_v); +// SFINAE tests. + +#if TEST_STD_VER >= 23 + +static_assert( + !test_convertible, StrView, std::ranges::range_value_t>(), + "This constructor must be explicit"); + +#else + +static_assert( + test_convertible, StrView, std::ranges::range_value_t>(), + "This constructor must not be explicit"); + +#endif // TEST_STD_VER >= 23 + constexpr bool test() { { using V = std::ranges::lazy_split_view; diff --git a/libcxx/test/std/ranges/range.adaptors/range.lazy.split/ctor.view.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.lazy.split/ctor.view.pass.cpp index 264e883beeaea..e7bf052a7e9ee 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.lazy.split/ctor.view.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.lazy.split/ctor.view.pass.cpp @@ -8,13 +8,14 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 -// constexpr lazy_split_view(View base, Pattern pattern); - -#include +// constexpr lazy_split_view(View base, Pattern pattern); // explicit since C++23 #include +#include #include #include + +#include "test_convertible.h" #include "types.h" struct ViewWithCounting : std::ranges::view_base { @@ -41,9 +42,27 @@ struct ViewWithCounting : std::ranges::view_base { constexpr ViewWithCounting& operator=(ViewWithCounting&&) = default; constexpr bool operator==(const ViewWithCounting&) const { return true; } }; + static_assert(std::ranges::forward_range); static_assert(std::ranges::view); +using View = ViewWithCounting; +using Pattern = ViewWithCounting; + +// SFINAE tests. + +#if TEST_STD_VER >= 23 + +static_assert(!test_convertible, View, Pattern>(), + "This constructor must be explicit"); + +#else + +static_assert( test_convertible, View, Pattern>(), + "This constructor must not be explicit"); + +#endif // TEST_STD_VER >= 23 + constexpr bool test() { // Calling the constructor with `(ForwardView, ForwardView)`. 
{ @@ -62,9 +81,6 @@ constexpr bool test() { // Make sure the arguments are moved, not copied. { - using View = ViewWithCounting; - using Pattern = ViewWithCounting; - // Arguments are lvalues. { int view_copied = 0, view_moved = 0, pattern_copied = 0, pattern_moved = 0; diff --git a/libcxx/test/std/ranges/range.adaptors/range.split/ctor.range.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.split/ctor.range.pass.cpp index 605e3d544b2d8..bbe08befdb419 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.split/ctor.range.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.split/ctor.range.pass.cpp @@ -11,7 +11,7 @@ // template // requires constructible_from> && // constructible_from>> -// constexpr split_view(Range&& r, range_value_t e); +// constexpr split_view(Range&& r, range_value_t e); // explicit since C++23 #include #include @@ -21,6 +21,9 @@ #include #include +#include "test_convertible.h" +#include "test_macros.h" + struct Counting { int* times_copied = nullptr; int* times_moved = nullptr; @@ -68,6 +71,22 @@ static_assert(std::ranges::random_access_range); static_assert(std::ranges::view); static_assert(std::is_copy_constructible_v); +// SFINAE tests. + +#if TEST_STD_VER >= 23 + +static_assert( + !test_convertible, StrView, std::ranges::range_value_t>(), + "This constructor must be explicit"); + +# else + +static_assert( + test_convertible, StrView, std::ranges::range_value_t>(), + "This constructor must not be explicit"); + +#endif // TEST_STD_VER >= 23 + constexpr bool test() { { using V = std::ranges::split_view; diff --git a/libcxx/test/std/ranges/range.adaptors/range.split/ctor.view.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.split/ctor.view.pass.cpp index ad206ee5ed751..963f85f8c478f 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.split/ctor.view.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.split/ctor.view.pass.cpp @@ -8,7 +8,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 -// constexpr split_view(View base, Pattern pattern); +// constexpr split_view(View base, Pattern pattern); // explicit since C++23 #include #include @@ -16,6 +16,9 @@ #include #include +#include "test_convertible.h" +#include "test_macros.h" + struct ViewWithCounting : std::ranges::view_base { int* times_copied = nullptr; int* times_moved = nullptr; @@ -38,6 +41,23 @@ struct ViewWithCounting : std::ranges::view_base { constexpr bool operator==(const ViewWithCounting&) const { return true; } }; +using View = ViewWithCounting; +using Pattern = ViewWithCounting; + +// SFINAE tests. + +#if TEST_STD_VER >= 23 + +static_assert(!test_convertible, View, Pattern>(), + "This constructor must be explicit"); + +#else + +static_assert( test_convertible, View, Pattern>(), + "This constructor must not be explicit"); + +#endif // TEST_STD_VER >= 23 + constexpr bool test() { { std::string_view input = "abc def"; @@ -48,9 +68,6 @@ constexpr bool test() { // Make sure the arguments are moved, not copied. { - using View = ViewWithCounting; - using Pattern = ViewWithCounting; - // Arguments are lvalues. 
{ int view_copied = 0, view_moved = 0, pattern_copied = 0, pattern_moved = 0; diff --git a/libcxx/test/std/ranges/range.adaptors/range.take.while/ctor.view.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.take.while/ctor.view.pass.cpp index 7adeb6713680a..469b2698c8844 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.take.while/ctor.view.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.take.while/ctor.view.pass.cpp @@ -8,7 +8,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 -// constexpr take_while_view(V base, Pred pred); +// constexpr take_while_view(V base, Pred pred); // explicit since C++23 #include #include @@ -16,6 +16,8 @@ #include #include "MoveOnly.h" +#include "test_convertible.h" +#include "test_macros.h" struct View : std::ranges::view_base { MoveOnly mo; @@ -32,9 +34,23 @@ struct Pred { bool operator()(int) const; }; +// SFINAE tests. + +#if TEST_STD_VER >= 23 + +static_assert(!test_convertible, View, Pred>(), + "This constructor must be explicit"); + +#else + +static_assert(test_convertible, View, Pred>(), + "This constructor must not be explicit"); + +#endif // TEST_STD_VER >= 23 + constexpr bool test() { { - std::ranges::take_while_view twv = {View{{}, MoveOnly{5}}, Pred{}}; + std::ranges::take_while_view twv{View{{}, MoveOnly{5}}, Pred{}}; assert(twv.pred().moved); assert(!twv.pred().copied); assert(std::move(twv).base().mo.get() == 5); @@ -45,5 +61,6 @@ constexpr bool test() { int main(int, char**) { test(); static_assert(test()); + return 0; } diff --git a/libcxx/test/std/ranges/range.adaptors/range.take/ctor.view_count.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.take/ctor.view_count.pass.cpp index 63b936da98181..f37ffb0825ac1 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.take/ctor.view_count.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.take/ctor.view_count.pass.cpp @@ -8,16 +8,31 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 -// constexpr take_view(V base, range_difference_t count); +// constexpr take_view(V base, range_difference_t count); // explicit since C++23 -#include #include +#include -#include "test_macros.h" +#include "test_convertible.h" #include "test_iterators.h" +#include "test_macros.h" #include "test_range.h" #include "types.h" +// SFINAE tests. 
+ +#if TEST_STD_VER >= 23 + +static_assert(!test_convertible, View, std::ranges::range_difference_t>(), + "This constructor must be explicit"); + +#else + +static_assert(test_convertible, View, std::ranges::range_difference_t>(), + "This constructor must be explicit"); + +#endif // TEST_STD_VER >= 23 + constexpr bool test() { int buffer[8] = {1, 2, 3, 4, 5, 6, 7, 8}; diff --git a/libcxx/test/std/ranges/range.adaptors/range.take/types.h b/libcxx/test/std/ranges/range.adaptors/range.take/types.h index 09549a9e086f0..db80e68bb21af 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.take/types.h +++ b/libcxx/test/std/ranges/range.adaptors/range.take/types.h @@ -54,4 +54,15 @@ static_assert(std::ranges::view); static_assert(std::ranges::random_access_range); static_assert(std::ranges::sized_range); +struct View : std::ranges::view_base { + constexpr explicit View(int* b, int* e) : begin_(b), end_(e) { } + + constexpr int* begin() const { return begin_; } + constexpr int* end() const { return end_; } + +private: + int* begin_; + int* end_; +}; + #endif // TEST_STD_RANGES_RANGE_ADAPTORS_RANGE_TAKE_TYPES_H diff --git a/libcxx/test/std/ranges/range.adaptors/range.transform/ctor.view_function.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.transform/ctor.view_function.pass.cpp index 7ce042603694d..63a43d189256f 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.transform/ctor.view_function.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.transform/ctor.view_function.pass.cpp @@ -8,14 +8,16 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 -// constexpr transform_view(View, F); +// constexpr transform_view(View, F); // explicit since C++23 +#include #include -#include +#include "test_convertible.h" +#include "test_macros.h" struct Range : std::ranges::view_base { - constexpr explicit Range(int* b, int* e) : begin_(b), end_(e) { } + constexpr explicit Range(int* b, int* e) : begin_(b), end_(e) {} constexpr int* begin() const { return begin_; } constexpr int* end() const { return end_; } @@ -28,6 +30,20 @@ struct F { constexpr int operator()(int i) const { return i + 100; } }; +// SFINAE tests. + +#if TEST_STD_VER >= 23 + +static_assert(!test_convertible, Range, F>(), + "This constructor must be explicit"); + +#else + +static_assert( test_convertible, Range, F>(), + "This constructor must not be explicit"); + +#endif // TEST_STD_VER >= 23 + constexpr bool test() { int buff[] = {1, 2, 3, 4, 5, 6, 7, 8}; @@ -41,16 +57,6 @@ constexpr bool test() { assert(view[7] == 108); } - { - Range range(buff, buff + 8); - F f; - std::ranges::transform_view view = {range, f}; - assert(view[0] == 101); - assert(view[1] == 102); - // ... - assert(view[7] == 108); - } - return true; } diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/ctor.first.last.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/ctor.first.last.pass.cpp index 0b02cadc32609..ee0e7fceffa61 100644 --- a/libcxx/test/std/ranges/range.factories/range.iota.view/ctor.first.last.pass.cpp +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/ctor.first.last.pass.cpp @@ -8,14 +8,55 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 -// constexpr iota_view(iterator first, see below last); +// constexpr iota_view(iterator first, see below last); // explicit since C++23 #include #include +#include "test_convertible.h" #include "test_macros.h" #include "types.h" +// SFINAE tests. 
+ +#if TEST_STD_VER >= 23 + +std::ranges::iota_view view; + +static_assert(!test_convertible, + decltype(std::ranges::iota_view{}.begin()), + decltype(std::ranges::iota_view{}.end())>(), + "This constructor must be explicit"); + +static_assert(!test_convertible, + decltype(std::ranges::iota_view{SomeInt{0}}.begin()), + decltype(std::unreachable_sentinel)>(), + "This constructor must be explicit"); + +static_assert(!test_convertible>, + decltype(std::ranges::iota_view{SomeInt(0), IntComparableWith(SomeInt(10))}.begin()), + decltype(std::ranges::iota_view{SomeInt(0), IntComparableWith(SomeInt(10))}.end())>(), + "This constructor must be explicit"); + +#else + +static_assert(test_convertible, + decltype(std::ranges::iota_view{}.begin()), + decltype(std::ranges::iota_view{}.end())>(), + "This constructor must not be explicit"); + +static_assert(test_convertible, + decltype(std::ranges::iota_view{SomeInt{0}}.begin()), + decltype(std::unreachable_sentinel)>(), + "This constructor must not be explicit"); + +static_assert(test_convertible>, + decltype(std::ranges::iota_view{SomeInt(0), IntComparableWith(SomeInt(10))}.begin()), + decltype(std::ranges::iota_view{SomeInt(0), IntComparableWith(SomeInt(10))}.end())>(), + "This constructor must not be explicit"); + +#endif // TEST_STD_VER >= 23 + constexpr bool test() { { std::ranges::iota_view commonView(SomeInt(0), SomeInt(10)); diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/ctor.value.bound.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/ctor.value.bound.pass.cpp index 906e0e092d2a6..7528e1ccf3ee0 100644 --- a/libcxx/test/std/ranges/range.factories/range.iota.view/ctor.value.bound.pass.cpp +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/ctor.value.bound.pass.cpp @@ -14,13 +14,52 @@ TEST_CLANG_DIAGNOSTIC_IGNORED("-Wsign-compare") TEST_GCC_DIAGNOSTIC_IGNORED("-Wsign-compare") TEST_MSVC_DIAGNOSTIC_IGNORED(4018 4389) // various "signed/unsigned mismatch" -// constexpr iota_view(type_identity_t value, type_identity_t bound); +// constexpr iota_view(type_identity_t value, type_identity_t bound); // explicit since C++23 #include #include +#include "test_convertible.h" #include "types.h" +// SFINAE tests. 
+ +#if TEST_STD_VER >= 23 + +static_assert(!test_convertible, + decltype(std::ranges::iota_view{}.begin()), + decltype(std::ranges::iota_view{}.end())>(), + "This constructor must be explicit"); + +static_assert(!test_convertible, + decltype(std::ranges::iota_view{}.begin()), + decltype(std::unreachable_sentinel)>(), + "This constructor must be explicit"); + +static_assert(!test_convertible>, + decltype(std::ranges::iota_view{SomeInt(0), IntComparableWith(SomeInt(10))}.begin()), + decltype(std::ranges::iota_view{SomeInt(0), IntComparableWith(SomeInt(10))}.end())>(), + "This constructor must be explicit"); + +#else + +static_assert( test_convertible, + decltype(std::ranges::iota_view{}.begin()), + decltype(std::ranges::iota_view{}.end())>(), + "This constructor must not be explicit"); + +static_assert( test_convertible, + decltype(std::ranges::iota_view{}.begin()), + decltype(std::unreachable_sentinel)>(), + "This constructor must not be explicit"); + +static_assert( test_convertible>, + decltype(std::ranges::iota_view{SomeInt(0), IntComparableWith(SomeInt(10))}.begin()), + decltype(std::ranges::iota_view{SomeInt(0), IntComparableWith(SomeInt(10))}.end())>(), + "This constructor must not be explicit"); + +#endif // TEST_STD_VER >= 23 + constexpr bool test() { { std::ranges::iota_view io(SomeInt(0), SomeInt(10)); From 101cfe18f7b80801be3e415ad62acfc57960ad90 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Thu, 23 Mar 2023 17:09:00 +0000 Subject: [PATCH 155/208] [libcxx] Fix build bustage with threads disabled Building with -DLIBCXX_ENABLE_THREADS=OFF -DLIBCXXABI_ENABLE_THREADS=OFF (like e.g. for wasm) fails after D146228 because of a misplaced std namespace begin/end. Reviewed By: philnik, #libc Differential Revision: https://reviews.llvm.org/D146682 --- libcxx/include/__condition_variable/condition_variable.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libcxx/include/__condition_variable/condition_variable.h b/libcxx/include/__condition_variable/condition_variable.h index e66f78725a08c..926effbb23e8e 100644 --- a/libcxx/include/__condition_variable/condition_variable.h +++ b/libcxx/include/__condition_variable/condition_variable.h @@ -29,10 +29,10 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> -#ifndef _LIBCPP_HAS_NO_THREADS - _LIBCPP_BEGIN_NAMESPACE_STD +#ifndef _LIBCPP_HAS_NO_THREADS + // enum class cv_status _LIBCPP_DECLARE_STRONG_ENUM(cv_status){no_timeout, timeout}; _LIBCPP_DECLARE_STRONG_ENUM_EPILOG(cv_status) @@ -234,10 +234,10 @@ inline void condition_variable::__do_timed_wait(unique_lock& __lk, wait_for(__lk, __tp - _Clock::now()); } -_LIBCPP_END_NAMESPACE_STD - #endif // _LIBCPP_HAS_NO_THREADS +_LIBCPP_END_NAMESPACE_STD + _LIBCPP_POP_MACROS #endif // _LIBCPP___CONDITION_VARIABLE_CONDITION_VARIABLE_H From 6a2a5f08de0a09171bb92f91cd7b9deea97f6cce Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Thu, 23 Mar 2023 17:16:31 +0000 Subject: [PATCH 156/208] [CodeGenPrepare] Don't give up if unable to sink first arg to a cold call Reviewed By: mkazantsev Differential Revision: https://reviews.llvm.org/D143892 --- llvm/lib/CodeGen/CodeGenPrepare.cpp | 3 +- .../Generic/addr-sink-call-multi-arg.ll | 34 +++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/Generic/addr-sink-call-multi-arg.ll diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 8d11f282516c8..b571b5a8ab5f4 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ 
-2279,7 +2279,8 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) { if (!Arg->getType()->isPointerTy()) continue; unsigned AS = Arg->getType()->getPointerAddressSpace(); - return optimizeMemoryInst(CI, Arg, Arg->getType(), AS); + if (optimizeMemoryInst(CI, Arg, Arg->getType(), AS)) + return true; } IntrinsicInst *II = dyn_cast(CI); diff --git a/llvm/test/CodeGen/Generic/addr-sink-call-multi-arg.ll b/llvm/test/CodeGen/Generic/addr-sink-call-multi-arg.ll new file mode 100644 index 0000000000000..b02bdc3b57242 --- /dev/null +++ b/llvm/test/CodeGen/Generic/addr-sink-call-multi-arg.ll @@ -0,0 +1,34 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -codegenprepare < %s | FileCheck %s +; REQUIRES: aarch64-registered-target + +; Check that we don't give up if unable to sink the first argument. + +target triple = "aarch64-linux" + +declare void @f(ptr, ptr) cold + +define void @g(i1 %c1, ptr %p, i32 %i) { +; CHECK-LABEL: @g( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A0:%.*]] = getelementptr ptr, ptr [[P:%.*]], i32 [[I:%.*]] +; CHECK-NEXT: br i1 [[C1:%.*]], label [[IF_THEN:%.*]], label [[EXIT:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, ptr [[P]], i64 32 +; CHECK-NEXT: call void @f(ptr [[A0]], ptr [[SUNKADDR]]) +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %a0 = getelementptr ptr, ptr %p, i32 %i + %a1 = getelementptr ptr, ptr %p, i32 4 + br i1 %c1, label %if.then, label %exit + +if.then: + call void @f(ptr %a0, ptr %a1) + br label %exit + +exit: + ret void +} From c70e360b355ad30a7dd299435aae0324c5033b3f Mon Sep 17 00:00:00 2001 From: Emilia Dreamer Date: Thu, 23 Mar 2023 19:31:39 +0200 Subject: [PATCH 157/208] [clang-format] Allow trailing return types in macros The trailing return type arrow checker verifies that a declaration is being parsed, however, this isn't true when inside of macros. It turns out the existence of the auto keyword is enough to make sure that we're dealing with a trailing return type, and whether we're in a declaration doesn't matter. 
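For illustration (this simply mirrors the unit test added below, it is not an
additional change), a macro body such as

  #define MAKE_DEF(NAME)                                     \
    auto NAME() -> int { return 42; }

now gets its arrow annotated as a trailing return type: inside the directive
Line.InPPDirective is true, so the presence of `auto` is sufficient even though
Line.MustBeDeclaration is false.
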
Fixes https://github.com/llvm/llvm-project/issues/47664 Reviewed By: HazardyKnusperkeks, owenpan Differential Revision: https://reviews.llvm.org/D141811 --- clang/lib/Format/TokenAnnotator.cpp | 3 ++- clang/unittests/Format/FormatTest.cpp | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 55be50aec203e..5dbda8fbe0719 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -1909,7 +1909,8 @@ class AnnotatingParser { } else if (Current.is(tok::arrow) && Style.Language == FormatStyle::LK_Java) { Current.setType(TT_LambdaArrow); - } else if (Current.is(tok::arrow) && AutoFound && Line.MustBeDeclaration && + } else if (Current.is(tok::arrow) && AutoFound && + (Line.MustBeDeclaration || Line.InPPDirective) && Current.NestingLevel == 0 && !Current.Previous->isOneOf(tok::kw_operator, tok::identifier)) { // not auto operator->() -> xxx; diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index eeb1234999a10..eb1b563b3d2c3 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -8010,6 +8010,11 @@ TEST_F(FormatTest, TrailingReturnType) { "auto aaaaaaaaaaaaaaaaaaaaaa(T t)\n" " -> decltype(eaaaaaaaaaaaaaaa(t.a).aaaaaaaa());"); + FormatStyle Style = getLLVMStyleWithColumns(60); + verifyFormat("#define MAKE_DEF(NAME) \\\n" + " auto NAME() -> int { return 42; }", + Style); + // Not trailing return types. verifyFormat("void f() { auto a = b->c(); }"); verifyFormat("auto a = p->foo();"); From 8088f5bf2dc051dc0828990b3df2a3299c9f0433 Mon Sep 17 00:00:00 2001 From: Paul Kirth Date: Thu, 23 Mar 2023 17:13:35 +0000 Subject: [PATCH 158/208] [support] Fix PrintNumber Test on AIX When fixing the test earlier, we missed the JSON case for NaN and INF, so handle those the same as for non-JSON, by creating the string dynamically. 
Reviewed By: abhina.sreeskantharajan Differential Revision: https://reviews.llvm.org/D146739 --- llvm/unittests/Support/ScopedPrinterTest.cpp | 50 ++++++++++---------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/llvm/unittests/Support/ScopedPrinterTest.cpp b/llvm/unittests/Support/ScopedPrinterTest.cpp index f62d310f25d95..9ebcb0b14bd43 100644 --- a/llvm/unittests/Support/ScopedPrinterTest.cpp +++ b/llvm/unittests/Support/ScopedPrinterTest.cpp @@ -510,7 +510,16 @@ FirstSecondThirdByteMask [ (0x333) } TEST_F(ScopedPrinterTest, PrintNumber) { - auto PrintFunc = [](ScopedPrinter &W) { + constexpr float MaxFloat = std::numeric_limits::max(); + constexpr float MinFloat = std::numeric_limits::min(); + constexpr float InfFloat = std::numeric_limits::infinity(); + const float NaNFloat = std::nanf("1"); + constexpr double MaxDouble = std::numeric_limits::max(); + constexpr double MinDouble = std::numeric_limits::min(); + constexpr double InfDouble = std::numeric_limits::infinity(); + const double NaNDouble = std::nan("1"); + + auto PrintFunc = [&](ScopedPrinter &W) { uint64_t Unsigned64Max = std::numeric_limits::max(); uint64_t Unsigned64Min = std::numeric_limits::min(); W.printNumber("uint64_t-max", Unsigned64Max); @@ -556,10 +565,6 @@ TEST_F(ScopedPrinterTest, PrintNumber) { W.printNumber("label", "value", 0); - float MaxFloat = std::numeric_limits::max(); - float MinFloat = std::numeric_limits::min(); - float InfFloat = std::numeric_limits::infinity(); - float NaNFloat = std::nanf("1"); W.printNumber("float-max", MaxFloat); W.printNumber("float-min", MinFloat); W.printNumber("float-inf", InfFloat); @@ -567,11 +572,7 @@ TEST_F(ScopedPrinterTest, PrintNumber) { W.printNumber("float-42.0", 42.0f); W.printNumber("float-42.5625", 42.5625f); - double MaxDouble = std::numeric_limits::max(); - double MinDouble = std::numeric_limits::min(); - double InfDouble = std::numeric_limits::infinity(); - double NaNDouble = std::nan("1"); - W.printNumber("double-max", MaxDouble); + W.printNumber("double-max", MaxDouble); W.printNumber("double-min", MinDouble); W.printNumber("double-inf", InfDouble); W.printNumber("double-nan", NaNDouble); @@ -583,29 +584,30 @@ TEST_F(ScopedPrinterTest, PrintNumber) { // implementation defined behavior. So format the max float/double, instead of // hard coding it in the tests. Note: we can't just use std::to_string(), // since we format the float in PrintNumber(). This isn't required for JSON - // formatting, since it uses exponents, which will be consistent. + // formatting, since it uses exponents, which will be consistent. However, + // NaN and INF may be printed differently, (like AIX), so we still need to + // handle those cases for JSON checking. // Allocate a buffer large enough to represent large floating point values // and construct the string representation for them there. 
char Buf[512]; - format("%5.1f", std::numeric_limits::max()).snprint(Buf, sizeof(Buf)); + format("%5.1f", MaxFloat).snprint(Buf, sizeof(Buf)); std::string MaxFloatStr(Buf); - format("%5.1f", std::numeric_limits::max()).snprint(Buf, sizeof(Buf)); + format("%5.1f", MaxDouble).snprint(Buf, sizeof(Buf)); std::string MaxDoubleStr(Buf); - format("%5.1f", std::numeric_limits::infinity()) - .snprint(Buf, sizeof(Buf)); + format("%5.1f", InfFloat).snprint(Buf, sizeof(Buf)); std::string InfFloatStr(Buf); - std::to_string(std::numeric_limits::infinity()); + format("%5.1f", InfDouble).snprint(Buf, sizeof(Buf)); std::string InfDoubleStr(Buf); - format("%5.1f", std::nanf("1")).snprint(Buf, sizeof(Buf)); + format("%5.1f", NaNFloat).snprint(Buf, sizeof(Buf)); std::string NaNFloatStr(Buf); - format("%5.1f", std::nan("1")).snprint(Buf, sizeof(Buf)); + format("%5.1f", NaNDouble).snprint(Buf, sizeof(Buf)); std::string NaNDoubleStr(Buf); std::string ExpectedOut = Twine( @@ -643,7 +645,7 @@ double-42.5625: 42.6 )") .str(); - const char *JSONExpectedOut = R"({ + std::string JSONExpectedOut = Twine(R"({ "uint64_t-max": 18446744073709551615, "uint64_t-min": 0, "uint32_t-max": 4294967295, @@ -667,17 +669,17 @@ double-42.5625: 42.6 }, "float-max": 3.4028234663852886e+38, "float-min": 1.1754943508222875e-38, - "float-inf": inf, - "float-nan": nan, + "float-inf": )" + std::to_string(InfFloat) + R"(, + "float-nan": )" + std::to_string(NaNFloat) + R"(, "float-42.0": 42, "float-42.5625": 42.5625, "double-max": 1.7976931348623157e+308, "double-min": 2.2250738585072014e-308, - "double-inf": inf, - "double-nan": nan, + "double-inf": )" + std::to_string(InfDouble) + R"(, + "double-nan": )" + std::to_string(NaNDouble) + R"(, "double-42.0": 42, "double-42.5625": 42.5625 -})"; +})").str(); verifyAll(ExpectedOut, JSONExpectedOut, PrintFunc); } From cc8a34b11b0cff9c28780401a61d1cfb9a0f8b36 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 23 Mar 2023 17:49:39 +0000 Subject: [PATCH 159/208] [X86] Refactor movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2))) fold to use KnownBits We don't need an explicit AND mask, we can use KnownBits to determine if each element has (the same) single non-zero bit and shift that into the msb/signbit for MOVMSK to access directly. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 22 +++++++++---------- llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll | 6 ++--- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 3a4173e443798..a87dc476a1849 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -54442,25 +54442,25 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2))) // iff pow2splat(c1). + // Use KnownBits to determine if only a single bit is non-zero + // in each element (pow2 or zero), and shift that bit to the msb. 
if (Src.getOpcode() == X86ISD::PCMPEQ && - Src.getOperand(0).getOpcode() == ISD::AND && ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) { - SDValue LHS = Src.getOperand(0).getOperand(0); - SDValue RHS = Src.getOperand(0).getOperand(1); - KnownBits KnownRHS = DAG.computeKnownBits(RHS); - if (KnownRHS.isConstant() && KnownRHS.getConstant().isPowerOf2()) { + KnownBits KnownSrc = DAG.computeKnownBits(Src.getOperand(0)); + if (KnownSrc.countMaxPopulation() == 1) { SDLoc DL(N); MVT ShiftVT = SrcVT; + SDValue ShiftSrc = Src.getOperand(0); if (ShiftVT.getScalarType() == MVT::i8) { // vXi8 shifts - we only care about the signbit so can use PSLLW. ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); - LHS = DAG.getBitcast(ShiftVT, LHS); + ShiftSrc = DAG.getBitcast(ShiftVT, ShiftSrc); } - unsigned ShiftAmt = KnownRHS.getConstant().countl_zero(); - LHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT, LHS, - ShiftAmt, DAG); - LHS = DAG.getNOT(DL, DAG.getBitcast(SrcVT, LHS), SrcVT); - return DAG.getNode(X86ISD::MOVMSK, DL, VT, LHS); + unsigned ShiftAmt = KnownSrc.countMinLeadingZeros(); + ShiftSrc = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT, + ShiftSrc, ShiftAmt, DAG); + ShiftSrc = DAG.getNOT(DL, DAG.getBitcast(SrcVT, ShiftSrc), SrcVT); + return DAG.getNode(X86ISD::MOVMSK, DL, VT, ShiftSrc); } } diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll index 761ad105f75dc..f22d705068150 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll @@ -863,10 +863,8 @@ define i1 @mask_v8i32(<8 x i32> %a0) { ; SSE2: # %bb.0: ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; From 5c9a26238a198c115b7a14acd80d1505438438c5 Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Thu, 23 Mar 2023 17:38:07 +0000 Subject: [PATCH 160/208] [CodeGenPrepare][NFC] Pre-commit test for memory use count fix Reviewed By: mkazantsev Differential Revision: https://reviews.llvm.org/D145705 --- llvm/test/CodeGen/Generic/addr-use-count.ll | 70 +++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 llvm/test/CodeGen/Generic/addr-use-count.ll diff --git a/llvm/test/CodeGen/Generic/addr-use-count.ll b/llvm/test/CodeGen/Generic/addr-use-count.ll new file mode 100644 index 0000000000000..a3b110bf60896 --- /dev/null +++ b/llvm/test/CodeGen/Generic/addr-use-count.ll @@ -0,0 +1,70 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -codegenprepare < %s | FileCheck %s +; REQUIRES: aarch64-registered-target + +; Test that `%addr` is sunk, even though the number of memory uses to scan exceeds the limit. + +target triple = "aarch64-linux" + +declare void @g(...) 
+ +define void @f(ptr %p) { +; CHECK-LABEL: @f( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 4 +; CHECK-NEXT: [[T0:%.*]] = load i32, ptr [[SUNKADDR]], align 4 +; CHECK-NEXT: [[T1:%.*]] = load i32, ptr [[SUNKADDR]], align 4 +; CHECK-NEXT: [[T2:%.*]] = load i32, ptr [[SUNKADDR]], align 4 +; CHECK-NEXT: [[T3:%.*]] = load i32, ptr [[SUNKADDR]], align 4 +; CHECK-NEXT: [[T4:%.*]] = load i32, ptr [[SUNKADDR]], align 4 +; CHECK-NEXT: [[T5:%.*]] = load i32, ptr [[SUNKADDR]], align 4 +; CHECK-NEXT: [[T6:%.*]] = load i32, ptr [[SUNKADDR]], align 4 +; CHECK-NEXT: [[T7:%.*]] = load i32, ptr [[SUNKADDR]], align 4 +; CHECK-NEXT: [[T8:%.*]] = load i32, ptr [[SUNKADDR]], align 4 +; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[SUNKADDR]], align 4 +; CHECK-NEXT: [[T10:%.*]] = load i32, ptr [[SUNKADDR]], align 4 +; CHECK-NEXT: [[T11:%.*]] = load i32, ptr [[SUNKADDR]], align 4 +; CHECK-NEXT: [[T12:%.*]] = load i32, ptr [[SUNKADDR]], align 4 +; CHECK-NEXT: [[T13:%.*]] = load i32, ptr [[SUNKADDR]], align 4 +; CHECK-NEXT: [[T14:%.*]] = load i32, ptr [[SUNKADDR]], align 4 +; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[SUNKADDR]], align 4 +; CHECK-NEXT: [[T16:%.*]] = load i32, ptr [[SUNKADDR]], align 4 +; CHECK-NEXT: [[T17:%.*]] = load i32, ptr [[SUNKADDR]], align 4 +; CHECK-NEXT: [[T18:%.*]] = load i32, ptr [[SUNKADDR]], align 4 +; CHECK-NEXT: [[SUNKADDR1:%.*]] = getelementptr i8, ptr [[P]], i64 8 +; CHECK-NEXT: [[T19:%.*]] = load i32, ptr [[SUNKADDR1]], align 4 +; CHECK-NEXT: call void @g(i32 [[T0]], i32 [[T1]], i32 [[T2]], i32 [[T3]], i32 [[T4]], i32 [[T5]], i32 [[T6]], i32 [[T7]], i32 [[T8]], i32 [[T9]], i32 [[T10]], i32 [[T11]], i32 [[T12]], i32 [[T13]], i32 [[T14]], i32 [[T15]], i32 [[T16]], i32 [[T17]], i32 [[T18]], i32 [[T19]]) +; CHECK-NEXT: ret void +; +entry: + %addr = getelementptr i8, ptr %p, i32 4 + br label %exit + +exit: + %t0 = load i32, ptr %addr + %t1 = load i32, ptr %addr + %t2 = load i32, ptr %addr + %t3 = load i32, ptr %addr + %t4 = load i32, ptr %addr + %t5 = load i32, ptr %addr + %t6 = load i32, ptr %addr + %t7 = load i32, ptr %addr + %t8 = load i32, ptr %addr + %t9 = load i32, ptr %addr + %t10 = load i32, ptr %addr + %t11 = load i32, ptr %addr + %t12 = load i32, ptr %addr + %t13 = load i32, ptr %addr + %t14 = load i32, ptr %addr + %t15 = load i32, ptr %addr + %t16 = load i32, ptr %addr + %t17 = load i32, ptr %addr + %t18 = load i32, ptr %addr + %addr.1 = getelementptr i8, ptr %addr, i32 4 + %t19 = load i32, ptr %addr.1 + + call void @g(i32 %t0, i32 %t1, i32 %t2, i32 %t3, i32 %t4, i32 %t5, i32 %t6, i32 %t7, + i32 %t8, i32 %t9, i32 %t10, i32 %t11, i32 %t12, i32 %t13, i32 %t14, i32 %t15, i32 %t16, + i32 %t17, i32 %t18, i32 %t19) + ret void +} From 73bec2b2c3c90bc503bde5b67a239708f2c0f183 Mon Sep 17 00:00:00 2001 From: Nicolas Vasilache Date: Thu, 23 Mar 2023 10:47:04 -0700 Subject: [PATCH 161/208] [mlir][Vector] Retire one old filter-based test Differential Revision: https://reviews.llvm.org/D146742 --- .../Vector/vector-contract-transforms.mlir | 27 ------------------- .../Dialect/Vector/TestVectorTransforms.cpp | 23 +--------------- 2 files changed, 1 insertion(+), 49 deletions(-) diff --git a/mlir/test/Dialect/Vector/vector-contract-transforms.mlir b/mlir/test/Dialect/Vector/vector-contract-transforms.mlir index 2cbd604759edc..e3f86ee0b39bc 100644 --- a/mlir/test/Dialect/Vector/vector-contract-transforms.mlir +++ b/mlir/test/Dialect/Vector/vector-contract-transforms.mlir @@ -1,7 +1,6 @@ // RUN: mlir-opt %s -test-vector-contraction-lowering | 
FileCheck %s // RUN: mlir-opt %s -test-vector-contraction-lowering=vector-lower-matrix-intrinsics=1 | FileCheck %s --check-prefix=MATRIX // RUN: mlir-opt %s -test-vector-contraction-lowering=vector-outerproduct=1 | FileCheck %s --check-prefix=OUTERPRODUCT -// RUN: mlir-opt %s -test-vector-contraction-lowering=vector-filter-outerproduct=1 | FileCheck %s --check-prefix=FILTEROUTERPRODUCT // RUN: mlir-opt %s -test-vector-contraction-lowering=vector-parallel-arith=1 | FileCheck %s --check-prefix=PARALLEL #dotp_accesses = [ @@ -1182,32 +1181,6 @@ func.func @matmul_7(%arg0: vector<2x1xf32>, %arg1: vector<1x3xf32>, %arg2: vecto return %0 : vector<3x2xf32> } -// FILTEROUTERPRODUCT-LABEL: func @matmul_4_filtered -// FILTEROUTERPRODUCT-SAME: %[[A:[a-zA-Z0-9]*]]: vector<4x4xf32>, -// FILTEROUTERPRODUCT-SAME: %[[B:[a-zA-Z0-9]*]]: vector<4x4xf32>, -// FILTEROUTERPRODUCT-SAME: %[[C:[a-zA-Z0-9]*]]: vector<4x4xf32> -// FILTEROUTERPRODUCT: %[[c0:.*]] = vector.contract {{{.*}}} %[[A]], %[[B]], %[[C]] -func.func @matmul_4_filtered(%arg0: vector<4x4xf32>, %arg1: vector<4x4xf32>, %arg2: vector<4x4xf32>) --> vector<4x4xf32> -{ - %0 = vector.contract #matmat_trait_0 %arg0, %arg1, %arg2 - : vector<4x4xf32>, vector<4x4xf32> into vector<4x4xf32> - return %0 : vector<4x4xf32> -} - -// FILTEROUTERPRODUCT-LABEL: func @matmul_4_not_filtered -// FILTEROUTERPRODUCT-SAME: %[[A:[a-zA-Z0-9]*]]: vector<3x4xf32>, -// FILTEROUTERPRODUCT-SAME: %[[B:[a-zA-Z0-9]*]]: vector<4x4xf32>, -// FILTEROUTERPRODUCT-SAME: %[[C:[a-zA-Z0-9]*]]: vector<3x4xf32> -// FILTEROUTERPRODUCT: %[[c0:.*]] = vector.contract {{{.*}}} %[[A]], %[[B]], %[[C]] -func.func @matmul_4_not_filtered(%arg0: vector<3x4xf32>, %arg1: vector<4x4xf32>, %arg2: vector<3x4xf32>) --> vector<3x4xf32> -{ - %0 = vector.contract #matmat_trait_0 %arg0, %arg1, %arg2 - : vector<3x4xf32>, vector<4x4xf32> into vector<3x4xf32> - return %0 : vector<3x4xf32> -} - // PARALLEL-LABEL: func @parrallel_contract_lowering // PARALLEL: %[[E0:.*]] = vector.extract %{{.*}}[0, 0] : vector<1x1x4xf32> // PARALLEL: %[[E1:.*]] = vector.extract %{{.*}}[0, 0] : vector<1x1x4xf32> diff --git a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp index 5a21bff0b39c3..f79ca2259fa38 100644 --- a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp +++ b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#include #include +#include #include "mlir/Analysis/SliceAnalysis.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" @@ -136,11 +136,6 @@ struct TestVectorContractionLowering *this, "vector-outerproduct", llvm::cl::desc("Lower vector.contract to vector.outerproduct"), llvm::cl::init(false)}; - Option lowerToFilterOuterProduct{ - *this, "vector-filter-outerproduct", - llvm::cl::desc("Lower vector.contract to vector.outerproduct but not for " - "vectors of size 4."), - llvm::cl::init(false)}; Option lowerToParallelArith{ *this, "vector-parallel-arith", llvm::cl::desc("Lower vector.contract to elementwise vector ops."), @@ -159,22 +154,6 @@ struct TestVectorContractionLowering return; } - // Test on one pattern in isolation. - if (lowerToFilterOuterProduct) { - VectorContractLowering lowering = VectorContractLowering::OuterProduct; - VectorTransformsOptions options{lowering}; - patterns.add( - options, &getContext(), /*benefit=*/1, [](vector::ContractionOp op) { - // Only lowers vector.contract where the lhs as a type vector - // where M is not 4. 
-            if (op.getRhsType().getShape()[0] == 4)
-              return failure();
-            return success();
-          });
-      (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
-      return;
-    }
-
     if (lowerToParallelArith) {
       vector::populateVectorContractLoweringPatterns(
          patterns,

From 637048f122dc5112a86ae8c5c437efa22379127e Mon Sep 17 00:00:00 2001
From: Julian Lettner
Date: Thu, 23 Mar 2023 11:01:33 -0700
Subject: [PATCH 162/208] [TSan][Darwin] Test fix external-swift-debugging.cpp

My recent change [1] extended the external-swift-debugging.cpp test, but
didn't account for PAC, under which function pointers aren't trivially
comparable. We could use `ptrauth_strip()`, but for the test it's easier to
just use the symbol name.

[1] https://reviews.llvm.org/D146264
---
 compiler-rt/test/tsan/Darwin/external-swift-debugging.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/compiler-rt/test/tsan/Darwin/external-swift-debugging.cpp b/compiler-rt/test/tsan/Darwin/external-swift-debugging.cpp
index 64475a3e97373..8f8b2d514ea0b 100644
--- a/compiler-rt/test/tsan/Darwin/external-swift-debugging.cpp
+++ b/compiler-rt/test/tsan/Darwin/external-swift-debugging.cpp
@@ -30,9 +30,6 @@ int main(int argc, char *argv[]) {
   fprintf(stderr, "Start.\n");
   // CHECK: Start.

-  fprintf(stderr, "ExternalWrite function address: %p\n", &ExternalWrite);
-  // CHECK: ExternalWrite function address: [[ExternalWrite_addr:0x[0-9a-z]+]]
-
   void *opaque_object = malloc(16);
   std::thread t1([opaque_object] {
     ExternalWrite(opaque_object);
@@ -85,7 +82,7 @@ __tsan_on_report(void *report) {
             info.dli_saddr, info.dli_sname);
   }
   // Ensure ExternalWrite() function is top of trace
-  // CHECK: 0: frame: 0x{{[0-9a-z]+}}, function: [[ExternalWrite_addr]] _Z13ExternalWritePv
+  // CHECK: 0: frame: 0x{{[0-9a-z]+}}, function: 0x{{[0-9a-z]+}} _Z13ExternalWritePv
 }

 // CHECK: Done.

From 2bc4c3e920ee078ef2879b00c40440e0867f0b9e Mon Sep 17 00:00:00 2001
From: Nicolas Vasilache
Date: Thu, 23 Mar 2023 08:32:48 -0700
Subject: [PATCH 163/208] [mlir][Vector] NFC - Reorganize vector patterns

Vector dialect patterns have grown enormously in the past year to a point
where they are now impenetrable. Start reorganizing them towards
finer-grained control.
Differential Revision: https://reviews.llvm.org/D146736 --- .../mlir/Dialect/Vector/IR/VectorOps.h | 36 +- .../Vector/Transforms/LoweringPatterns.h | 248 ++ .../mlir/Dialect/Vector/Transforms/Passes.h | 6 - .../Vector/Transforms/VectorRewritePatterns.h | 382 +-- .../Vector/Transforms/VectorTransforms.h | 61 +- .../VectorToLLVM/ConvertVectorToLLVM.cpp | 1 + .../VectorToLLVM/ConvertVectorToLLVMPass.cpp | 6 +- .../Conversion/VectorToSCF/VectorToSCF.cpp | 3 +- .../Linalg/TransformOps/CMakeLists.txt | 2 +- .../TransformOps/LinalgTransformOps.cpp | 1 + .../TransformOps/VectorTransformOps.cpp | 17 +- .../Dialect/Vector/Transforms/CMakeLists.txt | 10 +- .../Transforms/LowerVectorBroadcast.cpp | 156 ++ .../Vector/Transforms/LowerVectorContract.cpp | 1329 ++++++++++ .../Vector/Transforms/LowerVectorGather.cpp | 173 ++ .../Vector/Transforms/LowerVectorMask.cpp | 144 +- ...orms.cpp => LowerVectorMultiReduction.cpp} | 9 +- .../Vector/Transforms/LowerVectorScan.cpp | 251 ++ .../Transforms/LowerVectorShapeCast.cpp | 177 ++ ...tePatterns.cpp => LowerVectorTransfer.cpp} | 245 +- .../Transforms/LowerVectorTranspose.cpp | 210 ++ .../Transforms/VectorTransferOpTransforms.cpp | 1 + .../VectorTransferSplitRewritePatterns.cpp | 55 +- .../Vector/Transforms/VectorTransforms.cpp | 2224 +---------------- .../Dialect/Vector/TestVectorTransforms.cpp | 8 +- .../llvm-project-overlay/mlir/BUILD.bazel | 1 + 26 files changed, 3103 insertions(+), 2653 deletions(-) create mode 100644 mlir/include/mlir/Dialect/Vector/Transforms/LoweringPatterns.h create mode 100644 mlir/lib/Dialect/Vector/Transforms/LowerVectorBroadcast.cpp create mode 100644 mlir/lib/Dialect/Vector/Transforms/LowerVectorContract.cpp create mode 100644 mlir/lib/Dialect/Vector/Transforms/LowerVectorGather.cpp rename mlir/lib/Dialect/Vector/Transforms/{VectorMultiDimReductionTransforms.cpp => LowerVectorMultiReduction.cpp} (98%) create mode 100644 mlir/lib/Dialect/Vector/Transforms/LowerVectorScan.cpp create mode 100644 mlir/lib/Dialect/Vector/Transforms/LowerVectorShapeCast.cpp rename mlir/lib/Dialect/Vector/Transforms/{VectorTransferPermutationMapRewritePatterns.cpp => LowerVectorTransfer.cpp} (57%) create mode 100644 mlir/lib/Dialect/Vector/Transforms/LowerVectorTranspose.cpp diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.h b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.h index 56f8b4bf22d21..4763b6525b934 100644 --- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.h +++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.h @@ -110,43 +110,11 @@ void populateFlattenVectorTransferPatterns(RewritePatternSet &patterns, void populateBubbleVectorBitCastOpPatterns(RewritePatternSet &patterns, PatternBenefit benefit = 1); -/// Collect a set of transfer read/write lowering patterns. -/// -/// These patterns lower transfer ops to simpler ops like `vector.load`, -/// `vector.store` and `vector.broadcast`. Only transfers with a transfer rank -/// of a most `maxTransferRank` are lowered. This is useful when combined with -/// VectorToSCF, which reduces the rank of vector transfer ops. -void populateVectorTransferLoweringPatterns( - RewritePatternSet &patterns, - std::optional maxTransferRank = std::nullopt, - PatternBenefit benefit = 1); - /// These patterns materialize masks for various vector ops such as transfers. void populateVectorMaskMaterializationPatterns(RewritePatternSet &patterns, bool force32BitVectorIndices, PatternBenefit benefit = 1); -/// Collects patterns to progressively lower vector.broadcast ops on high-D -/// vectors to low-D vector ops. 
-void populateVectorBroadcastLoweringPatterns(RewritePatternSet &patterns, - PatternBenefit benefit = 1); - -/// Collects patterns to progressively lower vector mask ops into elementary -/// selection and insertion ops. -void populateVectorMaskOpLoweringPatterns(RewritePatternSet &patterns, - PatternBenefit benefit = 1); - -/// Collects patterns to progressively lower vector.shape_cast ops on high-D -/// vectors into 1-D/2-D vector ops by generating data movement extract/insert -/// ops. -void populateVectorShapeCastLoweringPatterns(RewritePatternSet &patterns, - PatternBenefit benefit = 1); - -/// Collects patterns that lower scalar vector transfer ops to memref loads and -/// stores when beneficial. -void populateScalarVectorTransferLoweringPatterns(RewritePatternSet &patterns, - PatternBenefit benefit = 1); - /// Returns the integer type required for subscripts in the vector dialect. IntegerType getVectorSubscriptType(Builder &builder); @@ -214,8 +182,8 @@ void createMaskOpRegion(OpBuilder &builder, Operation *maskableOp); /// Creates a vector.mask operation around a maskable operation. Returns the /// vector.mask operation if the mask provided is valid. Otherwise, returns the /// maskable operation itself. -Operation *maskOperation(OpBuilder &builder, Operation *maskableOp, - Value mask, Value passthru = Value()); +Operation *maskOperation(OpBuilder &builder, Operation *maskableOp, Value mask, + Value passthru = Value()); /// Creates a vector select operation that picks values from `newValue` or /// `passthru` for each result vector lane based on `mask`. This utility is used diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/LoweringPatterns.h b/mlir/include/mlir/Dialect/Vector/Transforms/LoweringPatterns.h new file mode 100644 index 0000000000000..dfadffba3883b --- /dev/null +++ b/mlir/include/mlir/Dialect/Vector/Transforms/LoweringPatterns.h @@ -0,0 +1,248 @@ +//===- LoweringPatterns.h - Vector rewrite patterns --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_VECTOR_TRANSFORMS_LOWERINGPATTERNS_H +#define MLIR_DIALECT_VECTOR_TRANSFORMS_LOWERINGPATTERNS_H + +#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h" + +namespace mlir { +class RewritePatternSet; + +namespace vector { + +//===----------------------------------------------------------------------===// +// Lowering pattern populate functions +//===----------------------------------------------------------------------===// + +/// Populate the pattern set with the following patterns: +/// +/// [OuterProductOpLowering] +/// Progressively lower a `vector.outerproduct` to linearized +/// `vector.extract` + `vector.fma` + `vector.insert`. +/// +/// [ContractionOpLowering] +/// Progressive lowering of ContractionOp. +/// One: +/// %x = vector.contract with at least one free/batch dimension +/// is replaced by: +/// %a = vector.contract with one less free/batch dimension +/// %b = vector.contract with one less free/batch dimension +/// +/// [ContractionOpToMatmulOpLowering] +/// Progressively lower a `vector.contract` with row-major matmul semantics to +/// linearized `vector.shape_cast` + `vector.matmul` on the way to +/// `llvm.matrix.multiply`. 
+/// +/// [ContractionOpToDotLowering] +/// Progressively lower a `vector.contract` with row-major matmul semantics to +/// linearized `vector.extract` + `vector.reduce` + `vector.insert`. +/// +/// [ContractionOpToOuterProductOpLowering] +/// Progressively lower a `vector.contract` with row-major matmul semantics to +/// linearized `vector.extract` + `vector.outerproduct` + `vector.insert`. +void populateVectorContractLoweringPatterns( + RewritePatternSet &patterns, VectorTransformsOptions options, + PatternBenefit benefit = 1, bool disableOuterProductLowering = false); + +/// Collect a set of patterns to convert vector.multi_reduction op into +/// a sequence of vector.reduction ops. The patterns comprise: +/// +/// [InnerOuterDimReductionConversion] +/// Rewrites vector.multi_reduction such that all reduction dimensions are +/// either innermost or outermost, by adding the proper vector.transpose +/// operations. +/// +/// [ReduceMultiDimReductionRank] +/// Once in innermost or outermost reduction +/// form, rewrites n-D vector.multi_reduction into 2-D vector.multi_reduction, +/// by introducing vector.shape_cast ops to collapse + multi-reduce + expand +/// back. +/// +/// [TwoDimMultiReductionToElementWise] +/// Once in 2-D vector.multi_reduction form, with an **outermost** reduction +/// dimension, unroll the outer dimension to obtain a sequence of 1-D vector +/// ops. This also has an opportunity for tree-reduction (in the future). +/// +/// [TwoDimMultiReductionToReduction] +/// Once in 2-D vector.multi_reduction form, with an **innermost** reduction +/// dimension, unroll the outer dimension to obtain a sequence of extract + +/// vector.reduction + insert. This can further lower to horizontal reduction +/// ops. +/// +/// [OneDimMultiReductionToTwoDim] +/// For cases that reduce to 1-D vector reduction (and are thus missing +/// either a parallel or a reduction), we lift them back up to 2-D with a simple +/// vector.shape_cast to vector<1xk> so that the other patterns can kick in, +/// thus fully exiting out of the vector.multi_reduction abstraction. +void populateVectorMultiReductionLoweringPatterns( + RewritePatternSet &patterns, VectorMultiReductionLowering options, + PatternBenefit benefit = 1); + +/// Populate the pattern set with the following patterns: +/// +/// [TransferReadToVectorLoadLowering] +/// Progressive lowering of BroadcastOp to ExtractOp + InsertOp + lower-D +/// BroadcastOp until dim 1. +void populateVectorBroadcastLoweringPatterns(RewritePatternSet &patterns, + PatternBenefit benefit = 1); + +/// Populate the pattern set with the following patterns: +/// +/// [CreateMaskOp] +/// Progressive lowering of CreateMaskOp to lower-D CreateMaskOp until dim 1. +/// +/// [ConstantMaskOp] +/// Progressive lowering of ConstantMaskOp to lower-D ConstantMaskOp until +/// dim 1. +void populateVectorMaskOpLoweringPatterns(RewritePatternSet &patterns, + PatternBenefit benefit = 1); + +/// Collects patterns that lower scalar vector transfer ops to memref loads and +/// stores when beneficial. +void populateScalarVectorTransferLoweringPatterns(RewritePatternSet &patterns, + PatternBenefit benefit = 1); + +/// Populate the pattern set with the following patterns: +/// +/// [ShapeCastOp2DDownCastRewritePattern] +/// ShapeOp 2D -> 1D downcast serves the purpose of flattening 2-D to 1-D +/// vectors progressively. +/// +/// [ShapeCastOp2DUpCastRewritePattern] +/// ShapeOp 1D -> 2D upcast serves the purpose of unflattening 2-D from 1-D +/// vectors progressively. 
+/// +/// [ShapeCastOpRewritePattern] +/// Reference lowering to fully unrolled sequences of single element ExtractOp + +/// InsertOp. Note that applying this pattern can almost always be considered a +/// performance bug. +void populateVectorShapeCastLoweringPatterns(RewritePatternSet &patterns, + PatternBenefit benefit = 1); + +/// Populate the pattern set with the following patterns: +/// +/// [TransposeOpLowering] +/// +/// [TransposeOp2DToShuffleLowering] +/// +void populateVectorTransposeLoweringPatterns(RewritePatternSet &patterns, + VectorTransformsOptions options, + PatternBenefit benefit = 1); + +/// Populate the pattern set with the following patterns: +/// +/// [TransferReadToVectorLoadLowering] +/// Progressive lowering of transfer_read.This pattern supports lowering of +/// `vector.transfer_read` to a combination of `vector.load` and +/// `vector.broadcast` +/// +/// [TransferWriteToVectorStoreLowering] +/// Progressive lowering of transfer_write. This pattern supports lowering of +/// `vector.transfer_write` to `vector.store` +/// +/// [VectorLoadToMemrefLoadLowering] +/// Replace a 0-d vector.load with a memref.load + vector.broadcast. +/// +/// [VectorStoreToMemrefStoreLowering] +/// Replace a 0-d vector.store with a vector.extractelement + memref.store. +/// +/// These patterns lower transfer ops to simpler ops like `vector.load`, +/// `vector.store` and `vector.broadcast`. Only transfers with a transfer rank +/// of a most `maxTransferRank` are lowered. This is useful when combined with +/// VectorToSCF, which reduces the rank of vector transfer ops. +void populateVectorTransferLoweringPatterns( + RewritePatternSet &patterns, + std::optional maxTransferRank = std::nullopt, + PatternBenefit benefit = 1); + +/// Collect a set of transfer read/write lowering patterns that simplify the +/// permutation map (e.g., converting it to a minor identity map) by inserting +/// broadcasts and transposes. More specifically: +/// +/// [TransferReadPermutationLowering] +/// Lower transfer_read op with permutation into a transfer_read with a +/// permutation map composed of leading zeros followed by a minor identity + +/// vector.transpose op. +/// Ex: +/// vector.transfer_read ... +/// permutation_map: (d0, d1, d2) -> (0, d1) +/// into: +/// %v = vector.transfer_read ... +/// permutation_map: (d0, d1, d2) -> (d1, 0) +/// vector.transpose %v, [1, 0] +/// +/// vector.transfer_read ... +/// permutation_map: (d0, d1, d2, d3) -> (0, 0, 0, d1, d3) +/// into: +/// %v = vector.transfer_read ... +/// permutation_map: (d0, d1, d2, d3) -> (0, 0, d1, 0, d3) +/// vector.transpose %v, [0, 1, 3, 2, 4] +/// Note that an alternative is to transform it to linalg.transpose + +/// vector.transfer_read to do the transpose in memory instead. +/// +/// [TransferWritePermutationLowering] +/// Lower transfer_write op with permutation into a transfer_write with a +/// minor identity permutation map. (transfer_write ops cannot have broadcasts.) +/// Ex: +/// vector.transfer_write %v ... +/// permutation_map: (d0, d1, d2) -> (d2, d0, d1) +/// into: +/// %tmp = vector.transpose %v, [2, 0, 1] +/// vector.transfer_write %tmp ... +/// permutation_map: (d0, d1, d2) -> (d0, d1, d2) +/// +/// vector.transfer_write %v ... +/// permutation_map: (d0, d1, d2, d3) -> (d3, d2) +/// into: +/// %tmp = vector.transpose %v, [1, 0] +/// %v = vector.transfer_write %tmp ... 
+/// permutation_map: (d0, d1, d2, d3) -> (d2, d3) +/// +/// [TransferOpReduceRank] +/// Lower transfer_read op with broadcast in the leading dimensions into +/// transfer_read of lower rank + vector.broadcast. +/// Ex: vector.transfer_read ... +/// permutation_map: (d0, d1, d2, d3) -> (0, d1, 0, d3) +/// into: +/// %v = vector.transfer_read ... +/// permutation_map: (d0, d1, d2, d3) -> (d1, 0, d3) +/// vector.broadcast %v +void populateVectorTransferPermutationMapLoweringPatterns( + RewritePatternSet &patterns, PatternBenefit benefit = 1); + +/// Populate the pattern set with the following patterns: +/// +/// [ScanToArithOps] +/// Convert vector.scan op into arith ops and vector.insert_strided_slice / +/// vector.extract_strided_slice. +void populateVectorScanLoweringPatterns(RewritePatternSet &patterns, + PatternBenefit benefit = 1); + +/// Populate the pattern set with the following patterns: +/// +/// [FlattenGather] +/// Flattens 2 or more dimensional `vector.gather` ops by unrolling the +/// outermost dimension. For example: +/// +/// [Gather1DToConditionalLoads] +/// Turns 1-d `vector.gather` into a scalarized sequence of `vector.loads` or +/// `tensor.extract`s. To avoid out-of-bounds memory accesses, these +/// loads/extracts are made conditional using `scf.if` ops. +void populateVectorGatherLoweringPatterns(RewritePatternSet &patterns, + PatternBenefit benefit = 1); + +/// Populates instances of `MaskOpRewritePattern` to lower masked operations +/// with `vector.mask`. Patterns should rewrite the `vector.mask` operation and +/// not its nested `MaskableOpInterface`. +void populateVectorMaskLoweringPatternsForSideEffectingOps( + RewritePatternSet &patterns); + +} // namespace vector +} // namespace mlir +#endif // MLIR_DIALECT_VECTOR_TRANSFORMS_LOWERINGPATTERNS_H diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/Passes.h b/mlir/include/mlir/Dialect/Vector/Transforms/Passes.h index d0c06f69930d2..bf89b01e2b60c 100644 --- a/mlir/include/mlir/Dialect/Vector/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/Vector/Transforms/Passes.h @@ -22,12 +22,6 @@ std::unique_ptr createVectorBufferizePass(); /// Creates an instance of the `vector.mask` lowering pass. std::unique_ptr createLowerVectorMaskPass(); -/// Populates instances of `MaskOpRewritePattern` to lower masked operations -/// with `vector.mask`. Patterns should rewrite the `vector.mask` operation and -/// not its nested `MaskableOpInterface`. 
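The mask-lowering declaration removed just below moves from Passes.h into LoweringPatterns.h, next to the gather lowering relocated elsewhere in this patch. For orientation only, a minimal sketch of calling both from the new header; the helper name `addMaskAndGatherLowering` is illustrative and not an API added by the patch.

```
// Sketch under the post-patch headers; the helper name is illustrative only.
#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
#include "mlir/IR/PatternMatch.h"

static void addMaskAndGatherLowering(mlir::RewritePatternSet &patterns) {
  using namespace mlir::vector;
  // Unrolls n-D vector.gather and scalarizes the remaining 1-D gathers.
  populateVectorGatherLoweringPatterns(patterns);
  // Rewrites side-effecting ops wrapped in vector.mask.
  populateVectorMaskLoweringPatternsForSideEffectingOps(patterns);
}
```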
-void populateVectorMaskLoweringPatternsForSideEffectingOps( - RewritePatternSet &patterns); - //===----------------------------------------------------------------------===// // Registration //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h b/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h index af68de7e0051e..a79bbd0be0975 100644 --- a/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h +++ b/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h @@ -9,8 +9,8 @@ #ifndef MLIR_DIALECT_VECTOR_TRANSFORMS_VECTORREWRITEPATTERNS_H #define MLIR_DIALECT_VECTOR_TRANSFORMS_VECTORREWRITEPATTERNS_H -#include #include +#include #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/Vector/Transforms/VectorTransformsEnums.h.inc" @@ -23,42 +23,7 @@ namespace mlir { class RewritePatternSet; namespace vector { - -//===----------------------------------------------------------------------===// -// Vector transformation options exposed as auxiliary structs. -//===----------------------------------------------------------------------===// -/// Structure to control the behavior of vector transform patterns. -struct VectorTransformsOptions { - /// Option to control the lowering of vector.contract. - VectorContractLowering vectorContractLowering = VectorContractLowering::Dot; - VectorTransformsOptions & - setVectorTransformsOptions(VectorContractLowering opt) { - vectorContractLowering = opt; - return *this; - } - /// Option to control the lowering of vector.multi_reduction. - VectorMultiReductionLowering vectorMultiReductionLowering = - VectorMultiReductionLowering::InnerParallel; - VectorTransformsOptions & - setVectorMultiReductionLowering(VectorMultiReductionLowering opt) { - vectorMultiReductionLowering = opt; - return *this; - } - /// Option to control the lowering of vector.transpose. - VectorTransposeLowering vectorTransposeLowering = - VectorTransposeLowering::EltWise; - VectorTransformsOptions & - setVectorTransposeLowering(VectorTransposeLowering opt) { - vectorTransposeLowering = opt; - return *this; - } - /// Option to control the splitting of vector transfers. - VectorTransferSplit vectorTransferSplit = VectorTransferSplit::None; - VectorTransformsOptions &setVectorTransferSplit(VectorTransferSplit opt) { - vectorTransferSplit = opt; - return *this; - } -}; +struct VectorTransformsOptions; /// Options that control the vector unrolling. struct UnrollVectorOptions { @@ -109,45 +74,6 @@ struct UnrollVectorOptions { // Vector transformation exposed as populate functions over rewrite patterns. //===----------------------------------------------------------------------===// -/// Insert TransposeLowering patterns into extraction/insertion. -void populateVectorTransposeLoweringPatterns( - RewritePatternSet &patterns, - VectorTransformsOptions options = VectorTransformsOptions(), - PatternBenefit benefit = 1); - -/// Collect a set of patterns to convert vector.multi_reduction op into -/// a sequence of vector.reduction ops. The patterns comprise: -/// - InnerOuterDimReductionConversion: rewrites vector.multi_reduction such -/// that all reduction dimensions are either innermost or outermost, by adding -/// the proper vector.transpose operations. 
-/// - ReduceMultiDimReductionRank: once in innermost or outermost reduction -/// form, rewrites n-D vector.multi_reduction into 2-D vector.multi_reduction, -/// by introducing vector.shape_cast ops to collapse + multi-reduce + expand -/// back. -/// - TwoDimMultiReductionToElementWise: once in 2-D vector.multi_reduction -/// form, with an **outermost** reduction dimension, unroll the outer dimension -/// to obtain a sequence of 1-D vector ops. This also has an opportunity for -/// tree-reduction (in the future). -/// - TwoDimMultiReductionToReduction: once in 2-D vector.multi_reduction form, -/// with an **innermost** reduction dimension, unroll the outer dimension to -/// obtain a sequence of extract + vector.reduction + insert. This can further -/// lower to horizontal reduction ops. -/// - OneDimMultiReductionToTwoDim: for cases that reduce to 1-D vector -/// reduction (and are thus missing either a parallel or a reduction), we lift -/// them back up to 2-D with a simple vector.shape_cast to vector<1xk> so that -/// the other patterns can kick in, thus fully exiting out of the -/// vector.multi_reduction abstraction. -void populateVectorMultiReductionLoweringPatterns( - RewritePatternSet &patterns, VectorMultiReductionLowering options, - PatternBenefit benefit = 1); - -/// Collects patterns to progressively lower vector contraction ops on high-D -/// into low-D reduction and product ops. -void populateVectorContractLoweringPatterns( - RewritePatternSet &patterns, - VectorTransformsOptions options = VectorTransformsOptions(), - PatternBenefit benefit = 1); - /// Canonicalization of a `vector.contraction %a, %b, %c` with row-major matmul /// semantics to a contraction with MMT semantics (matrix matrix multiplication /// with the RHS transposed). This specific form is meant to have the vector @@ -174,67 +100,43 @@ void populateVectorContractCanonicalizeMatmulToMMT( void populateVectorReductionToContractPatterns(RewritePatternSet &patterns, PatternBenefit benefit = 1); -/// Collect patterns to convert scan op -void populateVectorScanLoweringPatterns(RewritePatternSet &patterns, - PatternBenefit benefit = 1); - -//===----------------------------------------------------------------------===// -// Vector.transfer patterns. -//===----------------------------------------------------------------------===// -/// Collect a set of transfer read/write lowering patterns that simplify the -/// permutation map (e.g., converting it to a minor identity map) by inserting -/// broadcasts and transposes. More specifically: -/// -/// [TransferReadPermutationLowering] -/// Lower transfer_read op with permutation into a transfer_read with a -/// permutation map composed of leading zeros followed by a minor identity + -/// vector.transpose op. -/// Ex: -/// vector.transfer_read ... -/// permutation_map: (d0, d1, d2) -> (0, d1) -/// into: -/// %v = vector.transfer_read ... -/// permutation_map: (d0, d1, d2) -> (d1, 0) -/// vector.transpose %v, [1, 0] +/// Populate `patterns` with the following patterns. /// -/// vector.transfer_read ... -/// permutation_map: (d0, d1, d2, d3) -> (0, 0, 0, d1, d3) -/// into: -/// %v = vector.transfer_read ... -/// permutation_map: (d0, d1, d2, d3) -> (0, 0, d1, 0, d3) -/// vector.transpose %v, [0, 1, 3, 2, 4] -/// Note that an alternative is to transform it to linalg.transpose + -/// vector.transfer_read to do the transpose in memory instead. 
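The multi-reduction and contract populate functions removed from this header now live in LoweringPatterns.h. For orientation, a minimal sketch of driving them with the greedy rewriter, in the same shape as the LowerVectorToLLVMPass call site updated later in this patch; the helper name `lowerVectorOps` and the default option values are illustrative only.

```
// Sketch only: wiring the relocated populate functions into a greedy rewrite.
// `lowerVectorOps` is an illustrative helper, not an API added by this patch.
#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

static mlir::LogicalResult lowerVectorOps(mlir::Operation *root) {
  using namespace mlir;
  RewritePatternSet patterns(root->getContext());
  vector::VectorTransformsOptions options; // Dot / InnerParallel / EltWise defaults.
  vector::populateVectorContractLoweringPatterns(patterns, options);
  vector::populateVectorMultiReductionLoweringPatterns(
      patterns, options.vectorMultiReductionLowering);
  vector::populateVectorTransferPermutationMapLoweringPatterns(patterns);
  return applyPatternsAndFoldGreedily(root, std::move(patterns));
}
```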
+/// - VectorTransferFullPartialRewriter /// -/// [TransferWritePermutationLowering] -/// Lower transfer_write op with permutation into a transfer_write with a -/// minor identity permutation map. (transfer_write ops cannot have broadcasts.) -/// Ex: -/// vector.transfer_write %v ... -/// permutation_map: (d0, d1, d2) -> (d2, d0, d1) -/// into: -/// %tmp = vector.transpose %v, [2, 0, 1] -/// vector.transfer_write %tmp ... -/// permutation_map: (d0, d1, d2) -> (d0, d1, d2) +/// Split a vector.transfer operation into an in-bounds (i.e., no out-of-bounds +/// masking) fast path and a slow path. /// -/// vector.transfer_write %v ... -/// permutation_map: (d0, d1, d2, d3) -> (d3, d2) -/// into: -/// %tmp = vector.transpose %v, [1, 0] -/// %v = vector.transfer_write %tmp ... -/// permutation_map: (d0, d1, d2, d3) -> (d2, d3) +/// Example (a 2-D vector.transfer_read): +/// ``` +/// %1 = vector.transfer_read %0[...], %pad : memref, vector<...> +/// ``` +/// is transformed into: +/// ``` +/// %1:3 = scf.if (%inBounds) { +/// // fast path, direct cast +/// memref.cast %A: memref to compatibleMemRefType +/// scf.yield %view : compatibleMemRefType, index, index +/// } else { +/// // slow path, not in-bounds vector.transfer or linalg.copy. +/// memref.cast %alloc: memref to compatibleMemRefType +/// scf.yield %4 : compatibleMemRefType, index, index +// } +/// %0 = vector.transfer_read %1#0[%1#1, %1#2] {in_bounds = [true ... true]} +/// ``` +/// where `alloc` is a top of the function alloca'ed buffer of one vector. /// -/// [TransferOpReduceRank] -/// Lower transfer_read op with broadcast in the leading dimensions into -/// transfer_read of lower rank + vector.broadcast. -/// Ex: vector.transfer_read ... -/// permutation_map: (d0, d1, d2, d3) -> (0, d1, 0, d3) -/// into: -/// %v = vector.transfer_read ... -/// permutation_map: (d0, d1, d2, d3) -> (d1, 0, d3) -/// vector.broadcast %v -void populateVectorTransferPermutationMapLoweringPatterns( - RewritePatternSet &patterns, PatternBenefit benefit = 1); +/// Preconditions: +/// 1. `xferOp.permutation_map()` must be a minor identity map +/// 2. the rank of the `xferOp.memref()` and the rank of the `xferOp.vector()` +/// must be equal. This will be relaxed in the future but requires +/// rank-reducing subviews. +void populateVectorTransferFullPartialPatterns( + RewritePatternSet &patterns, const VectorTransformsOptions &options); + +//===----------------------------------------------------------------------===// +// Vector.transfer patterns. +//===----------------------------------------------------------------------===// /// Collect a set of patterns to reduce the rank of the operands of vector /// transfer ops to operate on the largest contigious vector. @@ -334,220 +236,6 @@ void populateVectorUnrollPatterns(RewritePatternSet &patterns, const UnrollVectorOptions &options, PatternBenefit benefit = 1); -/// Expands `vector.gather` ops into a series of conditional scalar loads -/// (`vector.load` for memrefs or `tensor.extract` for tensors). These loads are -/// conditional to avoid out-of-bounds memory accesses and guarded with `scf.if` -/// ops. This lowering path is intended for targets that do not feature -/// dedicated gather ops. -void populateVectorGatherLoweringPatterns(RewritePatternSet &patterns, - PatternBenefit benefit = 1); - -//===----------------------------------------------------------------------===// -// Finer-grained patterns exposed for more control over individual lowerings. 
-//===----------------------------------------------------------------------===// -/// Apply `splitFullAndPartialTransfer` selectively via a pattern. This pattern -/// may take an extra filter to perform selection at a finer granularity. -struct VectorTransferFullPartialRewriter : public RewritePattern { - using FilterConstraintType = - std::function; - - explicit VectorTransferFullPartialRewriter( - MLIRContext *context, - VectorTransformsOptions options = VectorTransformsOptions(), - FilterConstraintType filter = - [](VectorTransferOpInterface op) { return success(); }, - PatternBenefit benefit = 1) - : RewritePattern(MatchAnyOpTypeTag(), benefit, context), options(options), - filter(std::move(filter)) {} - - /// Performs the rewrite. - LogicalResult matchAndRewrite(Operation *op, - PatternRewriter &rewriter) const override; - -private: - VectorTransformsOptions options; - FilterConstraintType filter; -}; - -/// Progressive lowering of a `vector.contract %a, %b, %c` with row-major matmul -/// semantics to: -/// ``` -/// %flattened_a = vector.shape_cast %a -/// %flattened_b = vector.shape_cast %b -/// %flattened_d = vector.matmul %flattened_a, %flattened_b -/// %d = vector.shape_cast %%flattened_d -/// %e = add %c, %d -/// ``` -/// `vector.matmul` later lowers to `llvm.matrix.multiply`. -// -/// This only kicks in when VectorTransformsOptions is set to OuterProduct and -/// the vector.contract op is a row-major matrix multiply. -class ContractionOpToMatmulOpLowering - : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - using FilterConstraintType = - std::function; - - static LogicalResult defaultFilter(vector::ContractionOp op) { - return success(); - } - - ContractionOpToMatmulOpLowering( - vector::VectorTransformsOptions vectorTransformOptions, - MLIRContext *context, PatternBenefit benefit = 1, - FilterConstraintType constraint = defaultFilter) - : OpRewritePattern(context, benefit), - vectorTransformOptions(vectorTransformOptions), - filter(std::move(constraint)) {} - - LogicalResult matchAndRewrite(vector::ContractionOp op, - PatternRewriter &rewriter) const override; - -private: - /// Options to control the vector patterns. - vector::VectorTransformsOptions vectorTransformOptions; - FilterConstraintType filter; -}; - -/// Progressive lowering of a `vector.contract %a, %b, %c` with row-major matmul -/// semantics to a reduction_size-unrolled sequence: -/// ``` -/// %at = vector.transpose %a, [1, 0] -/// %bRow0 = vector.extract %b[0] -/// %atRow0 = vector.extract %at[0] -/// %c0 = vector.outerproduct %atRow0, %bRow0, %c -/// ... -/// %bRowK = vector.extract %b[K] -/// %atRowK = vector.extract %at[K] -/// %cK = vector.outerproduct %atRowK, %bRowK, %cK-1 -/// ``` -/// -/// This only kicks in when VectorTransformsOptions is set to OuterProduct and -/// the vector.contract op is a row-major matrix multiply. 
-class ContractionOpToOuterProductOpLowering - : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - using FilterConstraintType = - std::function; - - static LogicalResult defaultFilter(vector::ContractionOp op) { - return success(); - } - - ContractionOpToOuterProductOpLowering( - vector::VectorTransformsOptions vectorTransformOptions, - MLIRContext *context, PatternBenefit benefit = 1, - FilterConstraintType constraint = defaultFilter) - : OpRewritePattern(context, benefit), - vectorTransformOptions(vectorTransformOptions), - filter(std::move(constraint)) {} - - LogicalResult matchAndRewrite(vector::ContractionOp op, - PatternRewriter &rewriter) const override; - -private: - /// Options to control the vector patterns. - vector::VectorTransformsOptions vectorTransformOptions; - FilterConstraintType filter; -}; - -/// Progressive lowering of a `vector.contract %a, %b, %c` with row-major matmul -/// semantics to an output-size-unrolled sequence: -/// ``` -/// %out = arith.constant ... : vector -/// %bt = vector.transpose %b, [1, 0] -/// %aRow0 = vector.extract %a[0] -/// %btRow0 = vector.extract %bt[0] -/// %c00 = vector.reduce %atRow0, %bRow0 -/// %out00 = vector.insert %c00, %out[0, 0] -/// ... -/// %aRowLast = vector.extract %at[M-1] -/// %btRowLast = vector.extract %b[N-1] -/// %cLastLast = vector.reduce %atRowLast, %bRowLast -/// %outcLastLast = vector.insert %cLastLast, %out[M-1, N-1] -/// ``` -/// -/// This only kicks in when VectorTransformsOptions is set to Dot and -/// the vector.contract op is a row-major matmul or matvec. -class ContractionOpToDotLowering - : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - using FilterConstraintType = - std::function; - - static LogicalResult defaultFilter(vector::ContractionOp op) { - return success(); - } - - ContractionOpToDotLowering( - vector::VectorTransformsOptions vectorTransformOptions, - MLIRContext *context, PatternBenefit benefit = 1, - const FilterConstraintType &constraint = defaultFilter) - : OpRewritePattern(context, benefit), - vectorTransformOptions(vectorTransformOptions), filter(defaultFilter) {} - - LogicalResult matchAndRewrite(vector::ContractionOp op, - PatternRewriter &rewriter) const override; - -private: - /// Options to control the vector patterns. - vector::VectorTransformsOptions vectorTransformOptions; - FilterConstraintType filter; -}; - -/// Progressive lowering of ContractionOp. -/// -/// One: -/// %x = vector.contract with at least one free/batch dimension -/// is replaced by: -/// %a = vector.contract with one less free/batch dimension -/// %b = vector.contract with one less free/batch dimension -/// .. -/// %x = combine %a %b .. -/// until a pure contraction is reached (no free/batch dimensions), -/// which is replaced by a dot-product. -/// -/// This only kicks in when either VectorTransformsOptions is set -/// to Dot or when other contraction patterns fail. 
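The contraction lowerings being moved out of this header, both the classes above and the one removed just below, all key off how `VectorTransformsOptions` is configured. Since that struct itself moves into VectorTransforms.h later in this patch, a small sketch of the chained-setter style it supports; the function name is illustrative and the enum values are the ones already referenced in the surrounding comments.

```
// Sketch only: the setters on VectorTransformsOptions return *this, so the
// lowering strategy can be picked in a single chained expression.
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"

static mlir::vector::VectorTransformsOptions outerProductStyleOptions() {
  using namespace mlir::vector;
  return VectorTransformsOptions()
      .setVectorTransformsOptions(VectorContractLowering::OuterProduct)
      .setVectorMultiReductionLowering(VectorMultiReductionLowering::InnerParallel)
      .setVectorTransposeLowering(VectorTransposeLowering::EltWise)
      .setVectorTransferSplit(VectorTransferSplit::None);
}
```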
-class ContractionOpLowering : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - using FilterConstraintType = - std::function; - - static LogicalResult defaultFilter(vector::ContractionOp op) { - return success(); - } - - ContractionOpLowering(vector::VectorTransformsOptions vectorTransformOptions, - MLIRContext *context, PatternBenefit benefit = 1, - FilterConstraintType constraint = defaultFilter) - : OpRewritePattern(context, benefit), - vectorTransformOptions(vectorTransformOptions), - filter(std::move(constraint)) {} - - LogicalResult matchAndRewrite(vector::ContractionOp op, - PatternRewriter &rewriter) const override; - -private: - /// Options to control the vector patterns. - vector::VectorTransformsOptions vectorTransformOptions; - FilterConstraintType filter; - // Lower one parallel dimension. - FailureOr lowerParallel(PatternRewriter &rewriter, - vector::ContractionOp op, int64_t lhsIndex, - int64_t rhsIndex, Value mask) const; - // Lower one reduction dimension. - FailureOr lowerReduction(PatternRewriter &rewriter, - vector::ContractionOp op, Value mask) const; -}; - } // namespace vector } // namespace mlir diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/VectorTransforms.h b/mlir/include/mlir/Dialect/Vector/Transforms/VectorTransforms.h index 947911f9a3841..52a4c9cc368d8 100644 --- a/mlir/include/mlir/Dialect/Vector/Transforms/VectorTransforms.h +++ b/mlir/include/mlir/Dialect/Vector/Transforms/VectorTransforms.h @@ -24,17 +24,53 @@ class IfOp; namespace vector { +//===----------------------------------------------------------------------===// +// Vector transformation options exposed as auxiliary structs. +//===----------------------------------------------------------------------===// +/// Structure to control the behavior of vector transform patterns. +struct VectorTransformsOptions { + /// Option to control the lowering of vector.contract. + VectorContractLowering vectorContractLowering = VectorContractLowering::Dot; + VectorTransformsOptions & + setVectorTransformsOptions(VectorContractLowering opt) { + vectorContractLowering = opt; + return *this; + } + /// Option to control the lowering of vector.multi_reduction. + VectorMultiReductionLowering vectorMultiReductionLowering = + VectorMultiReductionLowering::InnerParallel; + VectorTransformsOptions & + setVectorMultiReductionLowering(VectorMultiReductionLowering opt) { + vectorMultiReductionLowering = opt; + return *this; + } + /// Option to control the lowering of vector.transpose. + VectorTransposeLowering vectorTransposeLowering = + VectorTransposeLowering::EltWise; + VectorTransformsOptions & + setVectorTransposeLowering(VectorTransposeLowering opt) { + vectorTransposeLowering = opt; + return *this; + } + /// Option to control the splitting of vector transfers. + VectorTransferSplit vectorTransferSplit = VectorTransferSplit::None; + VectorTransformsOptions &setVectorTransferSplit(VectorTransferSplit opt) { + vectorTransferSplit = opt; + return *this; + } +}; + //===----------------------------------------------------------------------===// // Standalone transformations and helpers. //===----------------------------------------------------------------------===// -/// Split a vector.transfer operation into an in-bounds (i.e., no out-of-bounds -/// masking) fastpath and a slowpath. -/// If `ifOp` is not null and the result is `success, the `ifOp` points to the -/// newly created conditional upon function return. 
-/// To accomodate for the fact that the original vector.transfer indexing may be -/// arbitrary and the slow path indexes @[0...0] in the temporary buffer, the -/// scf.if op returns a view and values of type index. -/// At this time, only vector.transfer_read case is implemented. +/// Split a vector.transfer operation into an in-bounds (i.e., no +/// out-of-bounds masking) fastpath and a slowpath. If `ifOp` is not null and +/// the result is `success, the `ifOp` points to the newly created conditional +/// upon function return. To accomodate for the fact that the original +/// vector.transfer indexing may be arbitrary and the slow path indexes +/// @[0...0] in the temporary buffer, the scf.if op returns a view and values +/// of type index. At this time, only vector.transfer_read case is +/// implemented. /// /// Example (a 2-D vector.transfer_read): /// ``` @@ -51,15 +87,16 @@ namespace vector { /// memref.cast %alloc: memref to compatibleMemRefType /// scf.yield %4 : compatibleMemRefType, index, index // } -/// %0 = vector.transfer_read %1#0[%1#1, %1#2] {in_bounds = [true ... true]} +/// %0 = vector.transfer_read %1#0[%1#1, %1#2] {in_bounds = [true ... +/// true]} /// ``` /// where `alloc` is a top of the function alloca'ed buffer of one vector. /// /// Preconditions: /// 1. `xferOp.permutation_map()` must be a minor identity map -/// 2. the rank of the `xferOp.memref()` and the rank of the `xferOp.vector()` -/// must be equal. This will be relaxed in the future but requires -/// rank-reducing subviews. +/// 2. the rank of the `xferOp.memref()` and the rank of the +/// `xferOp.vector()` must be equal. This will be relaxed in the future but +/// requires rank-reducing subviews. LogicalResult splitFullAndPartialTransfer( RewriterBase &b, VectorTransferOpInterface xferOp, VectorTransformsOptions options = VectorTransformsOptions(), diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp index c56d03f6f31d7..05def0f45d7fb 100644 --- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp +++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp @@ -16,6 +16,7 @@ #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Vector/Interfaces/MaskableOpInterface.h" +#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/TypeUtilities.h" diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp index fb544df18324b..3f1b107f6f8e0 100644 --- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp +++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp @@ -19,6 +19,7 @@ #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" #include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h" #include "mlir/Dialect/X86Vector/Transforms.h" #include "mlir/Dialect/X86Vector/X86VectorDialect.h" @@ -64,10 +65,11 @@ void LowerVectorToLLVMPass::runOnOperation() { RewritePatternSet patterns(&getContext()); populateVectorToVectorCanonicalizationPatterns(patterns); populateVectorBroadcastLoweringPatterns(patterns); - populateVectorContractLoweringPatterns(patterns); + populateVectorContractLoweringPatterns(patterns, 
VectorTransformsOptions()); populateVectorMaskOpLoweringPatterns(patterns); populateVectorShapeCastLoweringPatterns(patterns); - populateVectorTransposeLoweringPatterns(patterns); + populateVectorTransposeLoweringPatterns(patterns, + VectorTransformsOptions()); // Vector transfer ops with rank > 1 should be lowered with VectorToSCF. populateVectorTransferLoweringPatterns(patterns, /*maxTransferRank=*/1); (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)); diff --git a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp index d8070b34a761d..ec2e2aa4c0624 100644 --- a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp +++ b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp @@ -10,8 +10,8 @@ // //===----------------------------------------------------------------------===// -#include #include +#include #include "mlir/Conversion/VectorToSCF/VectorToSCF.h" @@ -20,6 +20,7 @@ #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h" #include "mlir/IR/Builders.h" #include "mlir/IR/ImplicitLocOpBuilder.h" diff --git a/mlir/lib/Dialect/Linalg/TransformOps/CMakeLists.txt b/mlir/lib/Dialect/Linalg/TransformOps/CMakeLists.txt index eb97c6e168e5c..b7d9812ada0b1 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/CMakeLists.txt +++ b/mlir/lib/Dialect/Linalg/TransformOps/CMakeLists.txt @@ -20,5 +20,5 @@ add_mlir_dialect_library(MLIRLinalgTransformOps MLIRSideEffectInterfaces MLIRTransformDialect MLIRTransformDialectUtils - MLIRVectorDialect + MLIRVectorTransforms ) diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index d98eb3b781fc5..e3c1429ade54a 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -26,6 +26,7 @@ #include "mlir/Dialect/Transform/IR/TransformTypes.h" #include "mlir/Dialect/Transform/Utils/Utils.h" #include "mlir/Dialect/Utils/IndexingUtils.h" +#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/Matchers.h" diff --git a/mlir/lib/Dialect/Vector/TransformOps/VectorTransformOps.cpp b/mlir/lib/Dialect/Vector/TransformOps/VectorTransformOps.cpp index 60996b9add614..136d234742b8d 100644 --- a/mlir/lib/Dialect/Vector/TransformOps/VectorTransformOps.cpp +++ b/mlir/lib/Dialect/Vector/TransformOps/VectorTransformOps.cpp @@ -7,13 +7,14 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/Vector/TransformOps/VectorTransformOps.h" - #include "mlir/Conversion/VectorToSCF/VectorToSCF.h" #include "mlir/Dialect/PDL/IR/PDL.h" #include "mlir/Dialect/PDL/IR/PDLTypes.h" #include "mlir/Dialect/Transform/IR/TransformDialect.h" #include "mlir/Dialect/Transform/IR/TransformInterfaces.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" +#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h" #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h" #include "mlir/Dialect/X86Vector/Transforms.h" #include "mlir/Parser/Parser.h" @@ -82,10 +83,9 @@ DiagnosedSilenceableFailure transform::LowerVectorsOp::apply( // In the future we may want to more finely select particular stages. 
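Stage 3 of the rewrite below swaps a direct `patterns.add` of the full/partial rewriter for `populateVectorTransferFullPartialPatterns`. Used on its own it looks roughly like the following sketch; the split strategy `VectorTransferSplit::VectorTransfer` is assumed here for illustration, and the helper name is not part of the patch.

```
// Sketch only: opting into the in-bounds/out-of-bounds transfer split on its
// own. The particular split strategy is an assumption for this example.
#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"

static void addTransferSplitPatterns(mlir::RewritePatternSet &patterns) {
  using namespace mlir::vector;
  VectorTransformsOptions options;
  options.setVectorTransferSplit(VectorTransferSplit::VectorTransfer);
  populateVectorTransferFullPartialPatterns(patterns, options);
}
```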
// Stage 1: contraction lowerings. - patterns.add(vectorTransformOptions, - ctx); + populateVectorContractLoweringPatterns( + patterns, vectorTransformOptions, /*benefit=*/1, + /*disableOuterProductLowering*/ true); vector::populateVectorTransferPermutationMapLoweringPatterns(patterns); // Stage 2: multi-reduction lowerings. @@ -93,8 +93,7 @@ DiagnosedSilenceableFailure transform::LowerVectorsOp::apply( patterns, vectorTransformOptions.vectorMultiReductionLowering); // Stage 3: Rewrite vector.transfer into full and partial parts. - patterns.add( - ctx, vectorTransformOptions); + populateVectorTransferFullPartialPatterns(patterns, vectorTransformOptions); // Stage 4: Lower vector transfers. vector::populateVectorTransferLoweringPatterns(patterns, maxTransferRank); @@ -107,8 +106,8 @@ DiagnosedSilenceableFailure transform::LowerVectorsOp::apply( vector::populateVectorShapeCastLoweringPatterns(patterns); // Stage 7: Lower vector.transpose. - vector::populateVectorTransposeLoweringPatterns(patterns, - vectorTransformOptions); + vector::populateVectorTransposeLoweringPatterns( + patterns, vectorTransformOptions, /*benefit=*/1); if (getTransposeAvx2Lowering()) x86vector::avx2::populateSpecializedTransposeLoweringPatterns( patterns, avx2LoweringOptions, /*benefit=*/10); diff --git a/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt index 6fb1b8c18a122..f17208e193b3c 100644 --- a/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt @@ -1,14 +1,20 @@ add_mlir_dialect_library(MLIRVectorTransforms BufferizableOpInterfaceImpl.cpp Bufferize.cpp + LowerVectorBroadcast.cpp + LowerVectorContract.cpp + LowerVectorGather.cpp LowerVectorMask.cpp + LowerVectorMultiReduction.cpp + LowerVectorScan.cpp + LowerVectorShapeCast.cpp + LowerVectorTransfer.cpp + LowerVectorTranspose.cpp VectorDistribute.cpp VectorDropLeadUnitDim.cpp VectorInsertExtractStridedSliceRewritePatterns.cpp - VectorMultiDimReductionTransforms.cpp VectorTransferOpTransforms.cpp VectorTransferSplitRewritePatterns.cpp - VectorTransferPermutationMapRewritePatterns.cpp VectorTransforms.cpp VectorUnroll.cpp diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorBroadcast.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorBroadcast.cpp new file mode 100644 index 0000000000000..ad538fe4a6828 --- /dev/null +++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorBroadcast.cpp @@ -0,0 +1,156 @@ +//===- LowerVectorBroadcast.cpp - Lower 'vector.broadcast' operation ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements target-independent rewrites and utilities to lower the +// 'vector.broadcast' operation. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Arith/Utils/Utils.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Utils/IndexingUtils.h" +#include "mlir/Dialect/Utils/StructuredOpsUtils.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" +#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h" +#include "mlir/Dialect/Vector/Utils/VectorUtils.h" +#include "mlir/IR/BuiltinAttributeInterfaces.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/ImplicitLocOpBuilder.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/Matchers.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/Interfaces/VectorInterfaces.h" +#include "mlir/Support/LogicalResult.h" + +#define DEBUG_TYPE "vector-broadcast-lowering" + +using namespace mlir; +using namespace mlir::vector; + +namespace { +/// Progressive lowering of BroadcastOp. +class BroadcastOpLowering : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(vector::BroadcastOp op, + PatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + VectorType dstType = op.getResultVectorType(); + VectorType srcType = op.getSourceType().dyn_cast(); + Type eltType = dstType.getElementType(); + + // Scalar to any vector can use splat. + if (!srcType) { + rewriter.replaceOpWithNewOp(op, dstType, op.getSource()); + return success(); + } + + // Determine rank of source and destination. + int64_t srcRank = srcType.getRank(); + int64_t dstRank = dstType.getRank(); + + // Stretching scalar inside vector (e.g. vector<1xf32>) can use splat. + if (srcRank <= 1 && dstRank == 1) { + Value ext; + if (srcRank == 0) + ext = rewriter.create(loc, op.getSource()); + else + ext = rewriter.create(loc, op.getSource(), 0); + rewriter.replaceOpWithNewOp(op, dstType, ext); + return success(); + } + + // Duplicate this rank. + // For example: + // %x = broadcast %y : k-D to n-D, k < n + // becomes: + // %b = broadcast %y : k-D to (n-1)-D + // %x = [%b,%b,%b,%b] : n-D + // becomes: + // %b = [%y,%y] : (n-1)-D + // %x = [%b,%b,%b,%b] : n-D + if (srcRank < dstRank) { + // Duplication. + VectorType resType = + VectorType::get(dstType.getShape().drop_front(), eltType); + Value bcst = + rewriter.create(loc, resType, op.getSource()); + Value result = rewriter.create( + loc, dstType, rewriter.getZeroAttr(dstType)); + for (int64_t d = 0, dim = dstType.getDimSize(0); d < dim; ++d) + result = rewriter.create(loc, bcst, result, d); + rewriter.replaceOp(op, result); + return success(); + } + + // Find non-matching dimension, if any. + assert(srcRank == dstRank); + int64_t m = -1; + for (int64_t r = 0; r < dstRank; r++) + if (srcType.getDimSize(r) != dstType.getDimSize(r)) { + m = r; + break; + } + + // All trailing dimensions are the same. Simply pass through. + if (m == -1) { + rewriter.replaceOp(op, op.getSource()); + return success(); + } + + // Any non-matching dimension forces a stretch along this rank. 
+    // For example:
+    //   %x = broadcast %y  : vector<4x1x2xf32> to vector<4x2x2xf32>
+    // becomes:
+    //   %a = broadcast %y[0] : vector<1x2xf32> to vector<2x2xf32>
+    //   %b = broadcast %y[1] : vector<1x2xf32> to vector<2x2xf32>
+    //   %c = broadcast %y[2] : vector<1x2xf32> to vector<2x2xf32>
+    //   %d = broadcast %y[3] : vector<1x2xf32> to vector<2x2xf32>
+    //   %x = [%a,%b,%c,%d]
+    // becomes:
+    //   %u = broadcast %y[0][0] : vector<2xf32> to vector <2x2xf32>
+    //   %v = broadcast %y[1][0] : vector<2xf32> to vector <2x2xf32>
+    //   %a = [%u, %v]
+    //   ..
+    //   %x = [%a,%b,%c,%d]
+    VectorType resType =
+        VectorType::get(dstType.getShape().drop_front(), eltType);
+    Value result = rewriter.create<arith::ConstantOp>(
+        loc, dstType, rewriter.getZeroAttr(dstType));
+    if (m == 0) {
+      // Stretch at start.
+      Value ext = rewriter.create<vector::ExtractOp>(loc, op.getSource(), 0);
+      Value bcst = rewriter.create<vector::BroadcastOp>(loc, resType, ext);
+      for (int64_t d = 0, dim = dstType.getDimSize(0); d < dim; ++d)
+        result = rewriter.create<vector::InsertOp>(loc, bcst, result, d);
+    } else {
+      // Stretch not at start.
+      for (int64_t d = 0, dim = dstType.getDimSize(0); d < dim; ++d) {
+        Value ext = rewriter.create<vector::ExtractOp>(loc, op.getSource(), d);
+        Value bcst = rewriter.create<vector::BroadcastOp>(loc, resType, ext);
+        result = rewriter.create<vector::InsertOp>(loc, bcst, result, d);
+      }
+    }
+    rewriter.replaceOp(op, result);
+    return success();
+  }
+};
+} // namespace
+
+void mlir::vector::populateVectorBroadcastLoweringPatterns(
+    RewritePatternSet &patterns, PatternBenefit benefit) {
+  patterns.add<BroadcastOpLowering>(patterns.getContext(), benefit);
+}
diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorContract.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorContract.cpp
new file mode 100644
index 0000000000000..1280cfef0b645
--- /dev/null
+++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorContract.cpp
@@ -0,0 +1,1329 @@
+//===- LowerVectorContract.cpp - Lower 'vector.contract' operation --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements target-independent rewrites and utilities to lower the
+// 'vector.contract' operation.
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Arith/Utils/Utils.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Utils/IndexingUtils.h" +#include "mlir/Dialect/Utils/StructuredOpsUtils.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" +#include "mlir/Dialect/Vector/Utils/VectorUtils.h" +#include "mlir/IR/BuiltinAttributeInterfaces.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/ImplicitLocOpBuilder.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/Matchers.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/Interfaces/VectorInterfaces.h" +#include "mlir/Support/LogicalResult.h" + +#define DEBUG_TYPE "vector-contract-lowering" + +using namespace mlir; +using namespace mlir::vector; + +//===----------------------------------------------------------------------===// +// Helper functions +//===----------------------------------------------------------------------===// + +// Helper to find an index in an affine map. +static std::optional getResultIndex(AffineMap map, int64_t index) { + for (int64_t i = 0, e = map.getNumResults(); i < e; ++i) { + int64_t idx = map.getDimPosition(i); + if (idx == index) + return i; + } + return std::nullopt; +} + +// Helper to construct iterator types with one index removed. +static SmallVector adjustIter(ArrayAttr iteratorTypes, + int64_t index) { + SmallVector results; + for (const auto &it : llvm::enumerate(iteratorTypes)) { + int64_t idx = it.index(); + if (idx == index) + continue; + results.push_back(it.value()); + } + return results; +} + +// Helper to construct an affine map with one index removed. +static AffineMap adjustMap(AffineMap map, int64_t index, + PatternRewriter &rewriter) { + auto *ctx = rewriter.getContext(); + SmallVector results; + for (int64_t i = 0, e = map.getNumResults(); i < e; ++i) { + int64_t idx = map.getDimPosition(i); + if (idx == index) + continue; + // Re-insert remaining indices, but renamed when occurring + // after the removed index. + auto targetExpr = getAffineDimExpr(idx < index ? idx : idx - 1, ctx); + results.push_back(targetExpr); + } + return AffineMap::get(map.getNumDims() - 1, 0, results, ctx); +} + +// Helper method to possibly drop a dimension in a load. +// TODO +static Value reshapeLoad(Location loc, Value val, VectorType type, + int64_t index, int64_t pos, + PatternRewriter &rewriter) { + if (index == -1) + return val; + Type lowType = VectorType::Builder(type).dropDim(0); + // At extraction dimension? + if (index == 0) { + auto posAttr = rewriter.getI64ArrayAttr(pos); + return rewriter.create(loc, lowType, val, posAttr); + } + // Unroll leading dimensions. 
+ VectorType vType = lowType.cast(); + Type resType = VectorType::Builder(type).dropDim(index); + auto resVectorType = resType.cast(); + Value result = rewriter.create( + loc, resVectorType, rewriter.getZeroAttr(resVectorType)); + for (int64_t d = 0, e = resVectorType.getDimSize(0); d < e; d++) { + auto posAttr = rewriter.getI64ArrayAttr(d); + Value ext = rewriter.create(loc, vType, val, posAttr); + Value load = reshapeLoad(loc, ext, vType, index - 1, pos, rewriter); + result = rewriter.create(loc, resVectorType, load, result, + posAttr); + } + return result; +} + +// Helper method to possibly drop a dimension in a store. +// TODO +static Value reshapeStore(Location loc, Value val, Value result, + VectorType type, int64_t index, int64_t pos, + PatternRewriter &rewriter) { + // Unmodified? + if (index == -1) + return val; + // At insertion dimension? + if (index == 0) { + auto posAttr = rewriter.getI64ArrayAttr(pos); + return rewriter.create(loc, type, val, result, posAttr); + } + // Unroll leading dimensions. + Type lowType = VectorType::Builder(type).dropDim(0); + VectorType vType = lowType.cast(); + Type insType = VectorType::Builder(vType).dropDim(0); + for (int64_t d = 0, e = type.getDimSize(0); d < e; d++) { + auto posAttr = rewriter.getI64ArrayAttr(d); + Value ext = rewriter.create(loc, vType, result, posAttr); + Value ins = rewriter.create(loc, insType, val, posAttr); + Value sto = reshapeStore(loc, ins, ext, vType, index - 1, pos, rewriter); + result = rewriter.create(loc, type, sto, result, posAttr); + } + return result; +} + +/// Helper to create arithmetic operation associated with a kind of contraction. +static std::optional +createContractArithOp(Location loc, Value x, Value y, Value acc, + vector::CombiningKind kind, PatternRewriter &rewriter, + bool isInt, Value mask = Value()) { + using vector::CombiningKind; + Value mul; + + if (isInt) { + if (kind == CombiningKind::MINF || kind == CombiningKind::MAXF) + // Only valid for floating point types. + return std::nullopt; + mul = rewriter.create(loc, x, y); + } else { + // Float case. + if (kind == CombiningKind::AND || kind == CombiningKind::MINUI || + kind == CombiningKind::MINSI || kind == CombiningKind::MAXUI || + kind == CombiningKind::MAXSI || kind == CombiningKind::OR || + kind == CombiningKind::XOR) + // Only valid for integer types. + return std::nullopt; + // Special case for fused multiply-add. + if (acc && acc.getType().isa() && kind == CombiningKind::ADD) { + Value fma = rewriter.create(loc, x, y, acc); + if (mask) + // The fma op doesn't need explicit masking. However, fma ops used in + // reductions must preserve previous 'acc' values for masked-out lanes. + fma = selectPassthru(rewriter, mask, fma, acc); + return fma; + } + mul = rewriter.create(loc, x, y); + } + + if (!acc) + return std::optional(mul); + + return makeArithReduction(rewriter, loc, kind, mul, acc, mask); +} + +/// Return the positions of the reductions in the given map. +static SmallVector getReductionIndex(AffineMap map, + ArrayAttr iteratorTypes) { + SmallVector dimsIdx; + for (unsigned i = 0, e = map.getNumResults(); i < e; i++) { + if (isReductionIterator(iteratorTypes[map.getDimPosition(i)])) + dimsIdx.push_back(i); + } + return dimsIdx; +} + +/// Look for a given dimension in an affine map and return its position. Return +/// std::nullopt if the dimension is not in the map results. 
+static std::optional getDimPosition(AffineMap map, unsigned dim) { + for (unsigned i = 0, e = map.getNumResults(); i < e; i++) { + if (map.getDimPosition(i) == dim) + return i; + } + return std::nullopt; +} + +/// Creates an AddIOp if `isInt` is true otherwise create an arith::AddFOp using +/// operands `x` and `y`. +static Value createAdd(Location loc, Value x, Value y, bool isInt, + PatternRewriter &rewriter) { + if (isInt) + return rewriter.create(loc, x, y); + return rewriter.create(loc, x, y); +} + +/// Creates a MulIOp if `isInt` is true otherwise create an MulFOp using +/// operands `x and `y`. +static Value createMul(Location loc, Value x, Value y, bool isInt, + PatternRewriter &rewriter) { + if (isInt) + return rewriter.create(loc, x, y); + return rewriter.create(loc, x, y); +} + +namespace { + +/// Progressive lowering of a `vector.contract %a, %b, %c` with row-major matmul +/// semantics to: +/// ``` +/// %flattened_a = vector.shape_cast %a +/// %flattened_b = vector.shape_cast %b +/// %flattened_d = vector.matmul %flattened_a, %flattened_b +/// %d = vector.shape_cast %%flattened_d +/// %e = add %c, %d +/// ``` +/// `vector.matmul` later lowers to `llvm.matrix.multiply`. +// +/// This only kicks in when VectorTransformsOptions is set to OuterProduct and +/// the vector.contract op is a row-major matrix multiply. +class ContractionOpToMatmulOpLowering + : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + using FilterConstraintType = + std::function; + + static LogicalResult defaultFilter(vector::ContractionOp op) { + return success(); + } + + ContractionOpToMatmulOpLowering( + vector::VectorTransformsOptions vectorTransformOptions, + MLIRContext *context, PatternBenefit benefit = 1, + FilterConstraintType constraint = defaultFilter) + : OpRewritePattern(context, benefit), + vectorTransformOptions(vectorTransformOptions), + filter(std::move(constraint)) {} + + LogicalResult matchAndRewrite(vector::ContractionOp op, + PatternRewriter &rewriter) const override; + +private: + /// Options to control the vector patterns. + vector::VectorTransformsOptions vectorTransformOptions; + FilterConstraintType filter; +}; + +/// Progressive lowering of a `vector.contract %a, %b, %c` with row-major matmul +/// semantics to a reduction_size-unrolled sequence: +/// ``` +/// %at = vector.transpose %a, [1, 0] +/// %bRow0 = vector.extract %b[0] +/// %atRow0 = vector.extract %at[0] +/// %c0 = vector.outerproduct %atRow0, %bRow0, %c +/// ... +/// %bRowK = vector.extract %b[K] +/// %atRowK = vector.extract %at[K] +/// %cK = vector.outerproduct %atRowK, %bRowK, %cK-1 +/// ``` +/// +/// This only kicks in when VectorTransformsOptions is set to OuterProduct and +/// the vector.contract op is a row-major matrix multiply. +class ContractionOpToOuterProductOpLowering + : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + using FilterConstraintType = + std::function; + + static LogicalResult defaultFilter(vector::ContractionOp op) { + return success(); + } + + ContractionOpToOuterProductOpLowering( + vector::VectorTransformsOptions vectorTransformOptions, + MLIRContext *context, PatternBenefit benefit = 1, + FilterConstraintType constraint = defaultFilter) + : OpRewritePattern(context, benefit), + vectorTransformOptions(vectorTransformOptions), + filter(std::move(constraint)) {} + + LogicalResult matchAndRewrite(vector::ContractionOp op, + PatternRewriter &rewriter) const override; + +private: + /// Options to control the vector patterns. 
+ vector::VectorTransformsOptions vectorTransformOptions; + FilterConstraintType filter; +}; + +/// Progressive lowering of a `vector.contract %a, %b, %c` with row-major matmul +/// semantics to an output-size-unrolled sequence: +/// ``` +/// %out = arith.constant ... : vector +/// %bt = vector.transpose %b, [1, 0] +/// %aRow0 = vector.extract %a[0] +/// %btRow0 = vector.extract %bt[0] +/// %c00 = vector.reduce %atRow0, %bRow0 +/// %out00 = vector.insert %c00, %out[0, 0] +/// ... +/// %aRowLast = vector.extract %at[M-1] +/// %btRowLast = vector.extract %b[N-1] +/// %cLastLast = vector.reduce %atRowLast, %bRowLast +/// %outcLastLast = vector.insert %cLastLast, %out[M-1, N-1] +/// ``` +/// +/// This only kicks in when VectorTransformsOptions is set to Dot and +/// the vector.contract op is a row-major matmul or matvec. +class ContractionOpToDotLowering + : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + using FilterConstraintType = + std::function; + + static LogicalResult defaultFilter(vector::ContractionOp op) { + return success(); + } + + ContractionOpToDotLowering( + vector::VectorTransformsOptions vectorTransformOptions, + MLIRContext *context, PatternBenefit benefit = 1, + const FilterConstraintType &constraint = defaultFilter) + : OpRewritePattern(context, benefit), + vectorTransformOptions(vectorTransformOptions), filter(defaultFilter) {} + + LogicalResult matchAndRewrite(vector::ContractionOp op, + PatternRewriter &rewriter) const override; + +private: + /// Options to control the vector patterns. + vector::VectorTransformsOptions vectorTransformOptions; + FilterConstraintType filter; +}; + +/// Progressive lowering of ContractionOp. +/// +/// One: +/// %x = vector.contract with at least one free/batch dimension +/// is replaced by: +/// %a = vector.contract with one less free/batch dimension +/// %b = vector.contract with one less free/batch dimension +/// .. +/// %x = combine %a %b .. +/// until a pure contraction is reached (no free/batch dimensions), +/// which is replaced by a dot-product. +/// +/// This only kicks in when either VectorTransformsOptions is set +/// to Dot or when other contraction patterns fail. +class ContractionOpLowering : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + using FilterConstraintType = + std::function; + + static LogicalResult defaultFilter(vector::ContractionOp op) { + return success(); + } + + ContractionOpLowering(vector::VectorTransformsOptions vectorTransformOptions, + MLIRContext *context, PatternBenefit benefit = 1, + FilterConstraintType constraint = defaultFilter) + : OpRewritePattern(context, benefit), + vectorTransformOptions(vectorTransformOptions), + filter(std::move(constraint)) {} + + LogicalResult matchAndRewrite(vector::ContractionOp op, + PatternRewriter &rewriter) const override; + +private: + /// Options to control the vector patterns. + vector::VectorTransformsOptions vectorTransformOptions; + FilterConstraintType filter; + // Lower one parallel dimension. + FailureOr lowerParallel(PatternRewriter &rewriter, + vector::ContractionOp op, int64_t lhsIndex, + int64_t rhsIndex, Value mask) const; + // Lower one reduction dimension. + FailureOr lowerReduction(PatternRewriter &rewriter, + vector::ContractionOp op, Value mask) const; +}; + +/// Generate a vector implementation for matmat, matvec and tmatvec. +/// This unrolls outer-products along the reduction dimension. 
+struct UnrolledOuterProductGenerator + : public StructuredGenerator { + UnrolledOuterProductGenerator(RewriterBase &b, vector::ContractionOp op) + : StructuredGenerator(b, op), + kind(op.getKind()), lhs(op.getLhs()), rhs(op.getRhs()), + res(op.getAcc()), lhsType(op.getLhsType()) { + auto maskableOp = cast(op.getOperation()); + if (maskableOp.isMasked()) + mask = maskableOp.getMaskingOp().getMask(); + } + + Value t(Value v, ArrayRef perm = {1, 0}) { + if (!v) + return v; + return rewriter.create(loc, v, perm); + } + + Value promote(Value v, Type dstElementType) { + Type elementType = v.getType(); + auto vecType = elementType.dyn_cast(); + if (vecType) + elementType = vecType.getElementType(); + if (elementType == dstElementType) + return v; + Type promotedType = dstElementType; + if (vecType) + promotedType = VectorType::get(vecType.getShape(), promotedType); + if (dstElementType.isa()) + return rewriter.create(loc, promotedType, v); + return rewriter.create(loc, promotedType, v); + } + + FailureOr outerProd(Value lhs, Value rhs, Value res, int reductionSize, + std::optional maybeMask = std::nullopt) { + assert(reductionSize > 0); + // Incremental support for masking. + if (mask && !maybeMask.has_value()) + return failure(); + + Type resElementType = res.getType().cast().getElementType(); + for (int64_t k = 0; k < reductionSize; ++k) { + Value extractA = rewriter.create(loc, lhs, k); + Value extractB = rewriter.create(loc, rhs, k); + extractA = promote(extractA, resElementType); + extractB = promote(extractB, resElementType); + Value extractMask; + if (maybeMask.has_value() && maybeMask.value()) + extractMask = + rewriter.create(loc, maybeMask.value(), k); + + Operation *outerProdOp = rewriter.create( + loc, res.getType(), extractA, extractB, res, kind); + res = maskOperation(rewriter, outerProdOp, extractMask)->getResult(0); + } + return res; + } + + /// Two outer parallel, one inner reduction (matmat flavor). + FailureOr matmat() { + if (!iters({Par(), Par(), Red()})) + return failure(); + // Set up the parallel/reduction structure in the right form. + AffineExpr m, n, k; + bindDims(rewriter.getContext(), m, n, k); + // Classical row-major matmul: Just permute the lhs. + if (layout({{m, k}, {k, n}, {m, n}})) + return outerProd(t(lhs), rhs, res, lhsType.getDimSize(1), + t(mask, {2, 0, 1})); + // TODO: may be better to fail and use some vector -> scalar reduction. + if (layout({{m, k}, {n, k}, {m, n}})) { + Value tlhs = t(lhs); + return outerProd(tlhs, t(rhs), res, lhsType.getDimSize(1)); + } + // No need to permute anything. + if (layout({{k, m}, {k, n}, {m, n}})) + return outerProd(lhs, rhs, res, lhsType.getDimSize(0)); + // Just permute the rhs. + if (layout({{k, m}, {n, k}, {m, n}})) + return outerProd(lhs, t(rhs), res, lhsType.getDimSize(0)); + // Transposed output: swap RHS and LHS. + // Classical row-major matmul: permute the lhs. + if (layout({{m, k}, {k, n}, {n, m}})) + return outerProd(rhs, t(lhs), res, lhsType.getDimSize(1)); + // TODO: may be better to fail and use some vector -> scalar reduction. 
+ if (layout({{m, k}, {n, k}, {n, m}})) { + Value trhs = t(rhs); + return outerProd(trhs, t(lhs), res, lhsType.getDimSize(1)); + } + if (layout({{k, m}, {k, n}, {n, m}})) + return outerProd(rhs, lhs, res, lhsType.getDimSize(0)); + if (layout({{k, m}, {n, k}, {n, m}})) + return outerProd(t(rhs), lhs, res, lhsType.getDimSize(0)); + return failure(); + } + + /// One outer parallel, one inner reduction (matvec flavor) + FailureOr matvec() { + if (!iters({Par(), Red()})) + return failure(); + AffineExpr m, k; + bindDims(rewriter.getContext(), m, k); + + // Case mat-vec: transpose. + if (layout({{m, k}, {k}, {m}})) + return outerProd(t(lhs), rhs, res, lhsType.getDimSize(1), t(mask)); + // Case mat-trans-vec: ready to go. + if (layout({{k, m}, {k}, {m}})) + return outerProd(lhs, rhs, res, lhsType.getDimSize(0)); + // Case vec-mat: swap and transpose. + if (layout({{k}, {m, k}, {m}})) + return outerProd(t(rhs), lhs, res, lhsType.getDimSize(0)); + // Case vec-mat-trans: swap and ready to go. + if (layout({{k}, {k, m}, {m}})) + return outerProd(rhs, lhs, res, lhsType.getDimSize(0)); + return failure(); + } + + // + // One outer reduction, one inner parallel (tmatvec flavor) + // + FailureOr tmatvec() { + if (!iters({Red(), Par()})) + return failure(); + AffineExpr k, m; + bindDims(rewriter.getContext(), k, m); + + // Case mat-vec: transpose. + if (layout({{m, k}, {k}, {m}})) + return outerProd(t(lhs), rhs, res, lhsType.getDimSize(1)); + // Case mat-trans-vec: ready to go. + if (layout({{k, m}, {k}, {m}})) + return outerProd(lhs, rhs, res, lhsType.getDimSize(0)); + // Case vec-mat: swap and transpose. + if (layout({{k}, {m, k}, {m}})) + return outerProd(t(rhs), lhs, res, lhsType.getDimSize(0)); + // Case vec-mat-trans: swap and ready to go. + if (layout({{k}, {k, m}, {m}})) + return outerProd(rhs, lhs, res, lhsType.getDimSize(0)); + return failure(); + } + +private: + vector::CombiningKind kind; + Value lhs, rhs, res, mask; + VectorType lhsType; +}; + +/// Progressively lower a `vector.contract %a, %b, %c` with row-major matmul +/// semantics to a reduction_size-unrolled sequence: +/// ``` +/// %at = vector.transpose %a, [1, 0] +/// %bRow0 = vector.extract %b[0] +/// %atRow0 = vector.extract %at[0] +/// %c0 = vector.outerproduct %atRow0, %bRow0, %c +/// ... +/// %bRowK = vector.extract %b[K] +/// %atRowK = vector.extract %at[K] +/// %cK = vector.outerproduct %atRowK, %bRowK, %cK-1 +/// ``` +/// +/// This only kicks in when VectorTransformsOptions is set to OuterProduct but +/// otherwise supports any layout permutation of the matrix-multiply. +LogicalResult ContractionOpToOuterProductOpLowering::matchAndRewrite( + vector::ContractionOp op, PatternRewriter &rewriter) const { + // TODO: Remove native masks from contraction op? + if (!op.getMasks().empty()) + return failure(); + + if (vectorTransformOptions.vectorContractLowering != + vector::VectorContractLowering::OuterProduct) + return failure(); + + if (failed(filter(op))) + return failure(); + + // Vector mask setup. 
+ OpBuilder::InsertionGuard guard(rewriter); + auto maskableOp = cast(op.getOperation()); + Operation *rootOp; + if (maskableOp.isMasked()) { + rewriter.setInsertionPoint(maskableOp.getMaskingOp()); + rootOp = maskableOp.getMaskingOp(); + } else { + rootOp = op; + } + + UnrolledOuterProductGenerator e(rewriter, op); + FailureOr matmatRes = e.matmat(); + if (succeeded(matmatRes)) { + rewriter.replaceOp(rootOp, *matmatRes); + return success(); + } + FailureOr matvecRes = e.matvec(); + if (succeeded(matvecRes)) { + rewriter.replaceOp(rootOp, *matvecRes); + return success(); + } + FailureOr tmatvecRes = e.tmatvec(); + if (succeeded(tmatvecRes)) { + rewriter.replaceOp(rootOp, *tmatvecRes); + return success(); + } + + return failure(); +} + +LogicalResult +ContractionOpToDotLowering::matchAndRewrite(vector::ContractionOp op, + PatternRewriter &rewriter) const { + // TODO: Support vector.mask. + auto maskableOp = cast(op.getOperation()); + if (maskableOp.isMasked()) + return failure(); + + // TODO: Remove native masks from contraction op? + if (!op.getMasks().empty()) + return failure(); + + if (failed(filter(op))) + return failure(); + + if (vectorTransformOptions.vectorContractLowering != + vector::VectorContractLowering::Dot) + return failure(); + + auto iteratorTypes = op.getIteratorTypes().getValue(); + static constexpr std::array perm = {1, 0}; + Location loc = op.getLoc(); + Value lhs = op.getLhs(), rhs = op.getRhs(); + + using MapList = ArrayRef>; + auto infer = [](MapList m) { return AffineMap::inferFromExprList(m); }; + AffineExpr m, n, k; + bindDims(rewriter.getContext(), m, n, k); + SmallVector maps = op.getIndexingMapsArray(); + // + // In the following we wish to make the reduction dimension innermost so we + // can load vectors and just fmul + reduce into a scalar. + // + if (isParallelIterator(iteratorTypes[0]) && + isParallelIterator(iteratorTypes[1]) && + isReductionIterator(iteratorTypes[2])) { + // + // Two outer parallel, one inner reduction (matmat flavor). + // + if (maps == infer({{m, k}, {k, n}, {m, n}})) { + rhs = rewriter.create(loc, rhs, perm); + } else if (maps == infer({{m, k}, {n, k}, {m, n}})) { + // No need to permute anything. + } else if (maps == infer({{k, m}, {k, n}, {m, n}})) { + lhs = rewriter.create(loc, lhs, perm); + rhs = rewriter.create(loc, rhs, perm); + } else if (maps == infer({{k, m}, {n, k}, {m, n}})) { + lhs = rewriter.create(loc, lhs, perm); + } else if (maps == infer({{m, k}, {k, n}, {n, m}})) { + // This is the classical row-major matmul. Just permute the lhs. + Value tmp = lhs; + lhs = rewriter.create(loc, rhs, perm); + rhs = tmp; + } else if (maps == infer({{m, k}, {n, k}, {n, m}})) { + std::swap(lhs, rhs); + } else if (maps == infer({{k, m}, {k, n}, {n, m}})) { + Value tmp = lhs; + lhs = rewriter.create(loc, rhs, perm); + rhs = rewriter.create(loc, tmp, perm); + } else if (maps == infer({{k, m}, {n, k}, {n, m}})) { + Value tmp = rhs; + rhs = rewriter.create(loc, lhs, perm); + lhs = tmp; + } else { + return failure(); + } + } else if (isParallelIterator(iteratorTypes[0]) && + isReductionIterator(iteratorTypes[1])) { + // + // One outer parallel, one inner reduction (matvec flavor) + // + if (maps == infer({{m, n}, {n}, {m}})) { + // No need to permute anything. 
+ } else if (maps == infer({{n, m}, {n}, {m}})) { + lhs = rewriter.create(loc, lhs, perm); + } else if (maps == infer({{n}, {m, n}, {m}})) { + std::swap(lhs, rhs); + } else if (maps == infer({{n}, {n, m}, {m}})) { + std::swap(lhs, rhs); + lhs = rewriter.create(loc, lhs, perm); + } else { + return failure(); + } + } else { + return failure(); + } + + VectorType dstType = op.getResultType().cast(); + assert(dstType.getRank() >= 1 && dstType.getRank() <= 2 && + "Expected dst type of rank 1 or 2"); + + unsigned rank = dstType.getRank(); + unsigned dstRows = dstType.getShape()[0]; + unsigned dstColumns = rank == 1 ? 1 : dstType.getShape()[1]; + + // ExtractOp does not allow dynamic indexing, we must unroll explicitly. + Value res = rewriter.create(loc, dstType, + rewriter.getZeroAttr(dstType)); + bool isInt = dstType.getElementType().isa(); + for (unsigned r = 0; r < dstRows; ++r) { + Value a = rewriter.create(op.getLoc(), lhs, r); + for (unsigned c = 0; c < dstColumns; ++c) { + Value b = rank == 1 + ? rhs + : rewriter.create(op.getLoc(), rhs, c); + Value m = createMul(op.getLoc(), a, b, isInt, rewriter); + Value reduced = rewriter.create( + op.getLoc(), vector::CombiningKind::ADD, m); + + SmallVector pos = rank == 1 ? SmallVector{r} + : SmallVector{r, c}; + res = rewriter.create(op.getLoc(), reduced, res, pos); + } + } + if (auto acc = op.getAcc()) + res = createAdd(op.getLoc(), res, acc, isInt, rewriter); + rewriter.replaceOp(op, res); + return success(); +} + +/// Lower vector.contract with all size one reduction dimensions to +/// elementwise ops when possible. +struct ContractOpToElementwise + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + using FilterConstraintType = + std::function; + static LogicalResult defaultFilter(vector::ContractionOp op) { + return success(); + } + ContractOpToElementwise( + vector::VectorTransformsOptions vectorTransformOptions, + MLIRContext *context, PatternBenefit benefit = 1, + const FilterConstraintType &constraint = defaultFilter) + : OpRewritePattern(context, benefit), + vectorTransformOptions(vectorTransformOptions), filter(defaultFilter) {} + + LogicalResult matchAndRewrite(vector::ContractionOp contractOp, + PatternRewriter &rewriter) const override { + // TODO: Support vector.mask. + auto maskableOp = cast(contractOp.getOperation()); + if (maskableOp.isMasked()) + return failure(); + + // TODO: Remove native masks from contraction op? + if (!contractOp.getMasks().empty()) + return failure(); + + if (failed(filter(contractOp))) + return failure(); + + if (vectorTransformOptions.vectorContractLowering != + vector::VectorContractLowering::ParallelArith) + return failure(); + + ArrayRef lhsShape = contractOp.getLhsType().getShape(); + ArrayRef rhsShape = contractOp.getRhsType().getShape(); + AffineMap lhsMap = contractOp.getIndexingMapsArray()[0]; + AffineMap rhsMap = contractOp.getIndexingMapsArray()[1]; + SmallVector lhsReductionDims = + getReductionIndex(lhsMap, contractOp.getIteratorTypes()); + SmallVector rhsReductionDims = + getReductionIndex(rhsMap, contractOp.getIteratorTypes()); + // All the reduction dimensions must be a size 1. 
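+    // E.g. (shapes assumed here purely for illustration) with lhs
+    // vector<4x1xf32>, rhs vector<1x4xf32> and a vector<4x4xf32> accumulator,
+    // the unit-size reduction dimension can be dropped and the contraction
+    // becomes broadcasts/transposes followed by an elementwise
+    // multiply-accumulate.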
+ for (int64_t dim : lhsReductionDims) { + if (lhsShape[dim] != 1) + return failure(); + } + for (int64_t dim : rhsReductionDims) { + if (rhsShape[dim] != 1) + return failure(); + } + AffineMap accMap = contractOp.getIndexingMapsArray()[2]; + unsigned numParallelDims = accMap.getNumResults(); + unsigned numLhsDimToBroadcast = + numParallelDims - (lhsMap.getNumResults() - lhsReductionDims.size()); + unsigned numRhsDimToBroadcast = + numParallelDims - (rhsMap.getNumResults() - rhsReductionDims.size()); + SmallVector lhsDims; + SmallVector lhsTranspose; + SmallVector rhsDims; + SmallVector rhsTranspose; + for (int64_t dim : lhsReductionDims) + lhsTranspose.push_back(numLhsDimToBroadcast + dim); + for (int64_t dim : rhsReductionDims) + rhsTranspose.push_back(numRhsDimToBroadcast + dim); + // Loop through the parallel dimensions to calculate the dimensions to + // broadcast and to permute in order to extract only parallel dimensions. + for (unsigned i = 0; i < numParallelDims; i++) { + std::optional lhsDim = + getDimPosition(lhsMap, accMap.getDimPosition(i)); + if (lhsDim) { + lhsTranspose.push_back(numLhsDimToBroadcast + *lhsDim); + } else { + // If the parallel dimension doesn't exist we will have to broadcast it. + lhsDims.push_back( + contractOp.getResultType().cast().getDimSize(i)); + lhsTranspose.push_back(lhsDims.size() - 1); + } + std::optional rhsDim = + getDimPosition(rhsMap, accMap.getDimPosition(i)); + if (rhsDim) { + rhsTranspose.push_back(numRhsDimToBroadcast + *rhsDim); + } else { + // If the parallel dimension doesn't exist we will have to broadcast it. + rhsDims.push_back( + contractOp.getResultType().cast().getDimSize(i)); + rhsTranspose.push_back(rhsDims.size() - 1); + } + } + Value newLhs = contractOp.getLhs(); + Value newRhs = contractOp.getRhs(); + Location loc = contractOp.getLoc(); + if (!lhsDims.empty()) { + lhsDims.append(lhsShape.begin(), lhsShape.end()); + auto expandedType = + VectorType::get(lhsDims, contractOp.getLhsType().getElementType()); + newLhs = rewriter.create(loc, expandedType, newLhs); + } + if (!rhsDims.empty()) { + rhsDims.append(rhsShape.begin(), rhsShape.end()); + auto expandedType = + VectorType::get(rhsDims, contractOp.getRhsType().getElementType()); + newRhs = rewriter.create(loc, expandedType, newRhs); + } + bool isInt = contractOp.getLhsType().getElementType().isIntOrIndex(); + newLhs = rewriter.create(loc, newLhs, lhsTranspose); + newRhs = rewriter.create(loc, newRhs, rhsTranspose); + SmallVector lhsOffsets(lhsReductionDims.size(), 0); + SmallVector rhsOffsets(rhsReductionDims.size(), 0); + newLhs = rewriter.create( + loc, newLhs, rewriter.getI64ArrayAttr(lhsOffsets)); + newRhs = rewriter.create( + loc, newRhs, rewriter.getI64ArrayAttr(rhsOffsets)); + std::optional result = + createContractArithOp(loc, newLhs, newRhs, contractOp.getAcc(), + contractOp.getKind(), rewriter, isInt); + rewriter.replaceOp(contractOp, {*result}); + return success(); + } + +private: + /// Options to control the vector patterns. + vector::VectorTransformsOptions vectorTransformOptions; + FilterConstraintType filter; +}; + +/// Progressive lowering of ContractionOp. +/// One: +/// %x = vector.contract with at least one free/batch dimension +/// is replaced by: +/// %a = vector.contract with one less free/batch dimension +/// %b = vector.contract with one less free/batch dimension +/// .. +/// %x = combine %a %b .. +/// until a pure contraction is reached (no free/batch dimensions), +/// which is replaced by a dot-product. 
+/// +/// This only kicks in when either VectorTransformsOptions is set +/// to DOT or when other contraction patterns fail. +// +// TODO: break down into transpose/reshape/cast ops +// when they become available to avoid code dup +// TODO: investigate lowering order impact on performance +LogicalResult +ContractionOpLowering::matchAndRewrite(vector::ContractionOp op, + PatternRewriter &rewriter) const { + // TODO: Remove native masks from contraction op? + if (!op.getMasks().empty()) + return failure(); + + if (failed(filter(op))) + return failure(); + + // TODO: support mixed mode contract lowering. + if (op.getLhsType().getElementType() != + getElementTypeOrSelf(op.getAccType()) || + op.getRhsType().getElementType() != getElementTypeOrSelf(op.getAccType())) + return failure(); + + // TODO: the code below assumes the default contraction, make sure it supports + // other kinds before enabling this lowering. + if (op.getKind() != vector::CombiningKind::ADD) { + return rewriter.notifyMatchFailure( + op, "contractions other than 'add' not supported"); + } + + // TODO: implement benefits, cost models. + MLIRContext *ctx = op.getContext(); + ContractionOpToMatmulOpLowering pat1(vectorTransformOptions, ctx); + if (succeeded(pat1.matchAndRewrite(op, rewriter))) + return success(); + ContractionOpToOuterProductOpLowering pat2(vectorTransformOptions, ctx); + if (succeeded(pat2.matchAndRewrite(op, rewriter))) + return success(); + ContractionOpToDotLowering pat3(vectorTransformOptions, ctx); + if (succeeded(pat3.matchAndRewrite(op, rewriter))) + return success(); + ContractOpToElementwise pat4(vectorTransformOptions, ctx); + if (succeeded(pat4.matchAndRewrite(op, rewriter))) + return success(); + + // Vector mask setup. + OpBuilder::InsertionGuard guard(rewriter); + Operation *rootOp = op; + Value mask; + if (op.isMasked()) { + rewriter.setInsertionPoint(op.getMaskingOp()); + rootOp = op.getMaskingOp(); + mask = op.getMaskingOp().getMask(); + } + + // Find first batch dimension in LHS/RHS, and lower when found. + std::vector> batchDimMap = op.getBatchDimMap(); + if (!batchDimMap.empty()) { + int64_t lhsIndex = batchDimMap[0].first; + int64_t rhsIndex = batchDimMap[0].second; + auto newOp = lowerParallel(rewriter, op, lhsIndex, rhsIndex, mask); + if (failed(newOp)) + return failure(); + rewriter.replaceOp(rootOp, *newOp); + return success(); + } + + // Collect contracting dimensions. + std::vector> contractingDimMap = + op.getContractingDimMap(); + DenseSet lhsContractingDimSet; + DenseSet rhsContractingDimSet; + for (auto &dimPair : contractingDimMap) { + lhsContractingDimSet.insert(dimPair.first); + rhsContractingDimSet.insert(dimPair.second); + } + + // Find first free dimension in LHS, and lower when found. + VectorType lhsType = op.getLhsType(); + for (int64_t lhsIndex = 0, e = lhsType.getRank(); lhsIndex < e; ++lhsIndex) { + if (lhsContractingDimSet.count(lhsIndex) == 0) { + auto newOp = lowerParallel(rewriter, op, lhsIndex, /*rhsIndex=*/-1, mask); + if (failed(newOp)) + return failure(); + rewriter.replaceOp(rootOp, *newOp); + return success(); + } + } + + // Find first free dimension in RHS, and lower when found. 
+ VectorType rhsType = op.getRhsType(); + for (int64_t rhsIndex = 0, e = rhsType.getRank(); rhsIndex < e; ++rhsIndex) { + if (rhsContractingDimSet.count(rhsIndex) == 0) { + auto newOp = lowerParallel(rewriter, op, /*lhsIndex=*/-1, rhsIndex, mask); + if (failed(newOp)) + return failure(); + rewriter.replaceOp(rootOp, *newOp); + return success(); + } + } + + // Lower the first remaining reduction dimension. + if (!contractingDimMap.empty()) { + auto newOp = lowerReduction(rewriter, op, mask); + if (failed(newOp)) + return failure(); + rewriter.replaceOp(rootOp, *newOp); + return success(); + } + + return failure(); +} + +// Lower one parallel dimension. +// Incidentally also tolerates unit-size (hence trivial) reduction dimensions. +// TODO: consider reusing existing contract unrolling +FailureOr ContractionOpLowering::lowerParallel(PatternRewriter &rewriter, + vector::ContractionOp op, + int64_t lhsIndex, + int64_t rhsIndex, + Value mask) const { + VectorType lhsType = op.getLhsType(); + VectorType rhsType = op.getRhsType(); + VectorType resType = op.getResultType().cast(); + // Find the iterator type index and result index. + SmallVector iMap = op.getIndexingMapsArray(); + int64_t iterIndex = -1; + int64_t dimSize = -1; + if (lhsIndex >= 0) { + iterIndex = iMap[0].getDimPosition(lhsIndex); + if (rhsIndex >= 0 && iterIndex != iMap[1].getDimPosition(rhsIndex)) + return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) { + diag << "expected lhsIndex=" << lhsIndex << " and rhsIndex=" << rhsIndex + << " to map to the same dimension"; + }); + dimSize = lhsType.getDimSize(lhsIndex); + } else if (rhsIndex >= 0) { + iterIndex = iMap[1].getDimPosition(rhsIndex); + dimSize = rhsType.getDimSize(rhsIndex); + } + if (iterIndex < 0) + return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) { + diag << "expected either lhsIndex=" << lhsIndex + << " or rhsIndex=" << rhsIndex << " to be nonnegative"; + }); + // value_or(-1) means that we tolerate a dimension not appearing + // in the result map. That can't happen for actual parallel iterators, but + // the caller ContractionOpLowering::matchAndRewrite is currently calling + // lowerParallel also for the case of unit-size reduction dims appearing only + // on one of LHS or RHS, not both. At the moment, such cases are created by + // CastAwayContractionLeadingOneDim, so we need to either support that or + // modify that pattern. + int64_t resIndex = getResultIndex(iMap[2], iterIndex).value_or(-1); + if (resIndex == -1 && dimSize != 1) + return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) { + diag << "expected the dimension for iterIndex=" << iterIndex + << " to either appear in the result map, or to be a unit dimension"; + }); + + // Construct new iterator types and affine map array attribute. + std::array lowIndexingMaps = { + adjustMap(iMap[0], iterIndex, rewriter), + adjustMap(iMap[1], iterIndex, rewriter), + adjustMap(iMap[2], iterIndex, rewriter)}; + auto lowAffine = rewriter.getAffineMapArrayAttr(lowIndexingMaps); + auto lowIter = + rewriter.getArrayAttr(adjustIter(op.getIteratorTypes(), iterIndex)); + // Unroll into a series of lower dimensional vector.contract ops. 
+ Location loc = op.getLoc(); + Value result = rewriter.create( + loc, resType, rewriter.getZeroAttr(resType)); + + for (int64_t d = 0; d < dimSize; ++d) { + auto lhs = reshapeLoad(loc, op.getLhs(), lhsType, lhsIndex, d, rewriter); + auto rhs = reshapeLoad(loc, op.getRhs(), rhsType, rhsIndex, d, rewriter); + auto acc = reshapeLoad(loc, op.getAcc(), resType, resIndex, d, rewriter); + + Value lowMask; + if (mask) + lowMask = reshapeLoad(loc, mask, cast(mask.getType()), + iterIndex, d, rewriter); + + Operation *lowContract = rewriter.create( + loc, lhs, rhs, acc, lowAffine, lowIter); + lowContract = maskOperation(rewriter, lowContract, lowMask); + result = reshapeStore(loc, lowContract->getResult(0), result, resType, + resIndex, d, rewriter); + } + return result; +} + +// Lower one reduction dimension. +FailureOr ContractionOpLowering::lowerReduction( + PatternRewriter &rewriter, vector::ContractionOp op, Value mask) const { + auto loc = op.getLoc(); + VectorType lhsType = op.getLhsType(); + VectorType rhsType = op.getRhsType(); + Type resType = op.getResultType(); + if (resType.isa()) + return rewriter.notifyMatchFailure(op, + "did not expect a VectorType result"); + bool isInt = resType.isa(); + // Use iterator index 0. + int64_t iterIndex = 0; + SmallVector iMap = op.getIndexingMapsArray(); + std::optional lookupLhs = getResultIndex(iMap[0], iterIndex); + std::optional lookupRhs = getResultIndex(iMap[1], iterIndex); + if (!lookupLhs.has_value()) + return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) { + diag << "expected iterIndex=" << iterIndex << "to map to a LHS dimension"; + }); + if (!lookupRhs.has_value()) + return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) { + diag << "expected iterIndex=" << iterIndex << "to map to a RHS dimension"; + }); + int64_t lhsIndex = *lookupLhs; + int64_t rhsIndex = *lookupRhs; + int64_t dimSize = lhsType.getDimSize(lhsIndex); + if (dimSize != rhsType.getDimSize(rhsIndex)) + return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) { + diag << "expect LHS dimension " << lhsIndex + << " to have the same size as RHS dimension " << rhsIndex; + }); + // Base case. + if (lhsType.getRank() == 1) { + if (rhsType.getRank() != 1) + return rewriter.notifyMatchFailure( + op, "When LHS has rank 1, expected also RHS to have rank 1"); + Value m = createMul(loc, op.getLhs(), op.getRhs(), isInt, rewriter); + auto kind = vector::CombiningKind::ADD; + + Value acc = op.getAcc(); + Operation *reductionOp = + acc ? rewriter.create(loc, kind, m, acc) + : rewriter.create(loc, kind, m); + return maskOperation(rewriter, reductionOp, mask)->getResult(0); + } + // Construct new iterator types and affine map array attribute. + std::array lowIndexingMaps = { + adjustMap(iMap[0], iterIndex, rewriter), + adjustMap(iMap[1], iterIndex, rewriter), + adjustMap(iMap[2], iterIndex, rewriter)}; + auto lowAffine = rewriter.getAffineMapArrayAttr(lowIndexingMaps); + auto lowIter = + rewriter.getArrayAttr(adjustIter(op.getIteratorTypes(), iterIndex)); + // Unroll into a series of lower dimensional vector.contract ops. + // By feeding the initial accumulator into the first contraction, + // and the result of each contraction into the next, eventually + // the sum of all reductions is computed. 
+ Value result = op.getAcc(); + for (int64_t d = 0; d < dimSize; ++d) { + auto lhs = reshapeLoad(loc, op.getLhs(), lhsType, lhsIndex, d, rewriter); + auto rhs = reshapeLoad(loc, op.getRhs(), rhsType, rhsIndex, d, rewriter); + Value newMask; + if (mask) + newMask = reshapeLoad(loc, mask, cast(mask.getType()), + iterIndex, d, rewriter); + + Operation *newContract = rewriter.create( + loc, lhs, rhs, result, lowAffine, lowIter); + result = maskOperation(rewriter, newContract, newMask)->getResult(0); + } + return result; +} + +/// Progressive lowering of OuterProductOp. +/// One: +/// %x = vector.outerproduct %lhs, %rhs, %acc +/// is replaced by: +/// %z = zero-result +/// %0 = vector.extract %lhs[0] +/// %1 = vector.broadcast %0 +/// %2 = vector.extract %acc[0] +/// %3 = vector.fma %1, %rhs, %2 +/// %4 = vector.insert %3, %z[0] +/// .. +/// %x = vector.insert %.., %..[N-1] +/// +class OuterProductOpLowering : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(vector::OuterProductOp op, + PatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + VectorType lhsType = op.getOperandVectorTypeLHS(); + VectorType rhsType = op.getOperandTypeRHS().dyn_cast(); + VectorType resType = op.getResultVectorType(); + Type eltType = resType.getElementType(); + bool isInt = eltType.isa(); + Value acc = (op.getAcc().empty()) ? nullptr : op.getAcc()[0]; + vector::CombiningKind kind = op.getKind(); + + // Vector mask setup. + OpBuilder::InsertionGuard guard(rewriter); + auto maskableOp = cast(op.getOperation()); + Operation *rootOp; + Value mask; + if (maskableOp.isMasked()) { + rewriter.setInsertionPoint(maskableOp.getMaskingOp()); + rootOp = maskableOp.getMaskingOp(); + mask = maskableOp.getMaskingOp().getMask(); + } else { + rootOp = op; + } + + if (!rhsType) { + // Special case: AXPY operation. + Value b = rewriter.create(loc, lhsType, op.getRhs()); + std::optional mult = createContractArithOp( + loc, op.getLhs(), b, acc, kind, rewriter, isInt, mask); + if (!mult.has_value()) + return failure(); + rewriter.replaceOp(rootOp, *mult); + return success(); + } + + Value result = rewriter.create( + loc, resType, rewriter.getZeroAttr(resType)); + for (int64_t d = 0, e = resType.getDimSize(0); d < e; ++d) { + auto pos = rewriter.getI64ArrayAttr(d); + Value x = rewriter.create(loc, op.getLhs(), pos); + Value a = rewriter.create(loc, rhsType, x); + Value r = nullptr; + if (acc) + r = rewriter.create(loc, acc, pos); + Value extrMask; + if (mask) + extrMask = rewriter.create(loc, mask, pos); + + std::optional m = createContractArithOp( + loc, a, op.getRhs(), r, kind, rewriter, isInt, extrMask); + if (!m.has_value()) + return failure(); + result = rewriter.create(loc, resType, *m, result, pos); + } + + rewriter.replaceOp(rootOp, result); + return success(); + } +}; + +/// Progressively lower a `vector.contract %a, %b, %c` with row-major matmul +/// semantics to: +/// ``` +/// %mta = maybe_transpose +/// %mtb = maybe_transpose +/// %flattened_a = vector.shape_cast %mta +/// %flattened_b = vector.shape_cast %mtb +/// %flattened_d = vector.matmul %flattened_a, %flattened_b +/// %mtd = vector.shape_cast %flattened_d +/// %d = maybe_untranspose %mtd +/// %e = add %c, %d +/// ``` +/// `vector.matmul` later lowers to `llvm.matrix.multiply`. +// +/// This only kicks in when VectorTransformsOptions is set to `Matmul`. +/// vector.transpose operations are inserted if the vector.contract op is not a +/// row-major matrix multiply. 
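+///
+/// For example, with assumed 2x4 and 4x3 f32 operands (shapes chosen here
+/// purely for illustration), a row-major contraction
+/// ```
+///    %d = vector.contract {...} %a, %b, %c
+///      : vector<2x4xf32>, vector<4x3xf32> into vector<2x3xf32>
+/// ```
+/// lowers roughly to (`vector.matmul` prints as `vector.matrix_multiply`):
+/// ```
+///    %fa = vector.shape_cast %a : vector<2x4xf32> to vector<8xf32>
+///    %fb = vector.shape_cast %b : vector<4x3xf32> to vector<12xf32>
+///    %fd = vector.matrix_multiply %fa, %fb
+///            {lhs_rows = 2 : i32, lhs_columns = 4 : i32, rhs_columns = 3 : i32}
+///      : (vector<8xf32>, vector<12xf32>) -> vector<6xf32>
+///    %d0 = vector.shape_cast %fd : vector<6xf32> to vector<2x3xf32>
+///    %d  = arith.addf %c, %d0 : vector<2x3xf32>
+/// ```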
+LogicalResult +ContractionOpToMatmulOpLowering::matchAndRewrite(vector::ContractionOp op, + PatternRewriter &rew) const { + // TODO: Support vector.mask. + auto maskableOp = cast(op.getOperation()); + if (maskableOp.isMasked()) + return failure(); + + // TODO: Remove native masks from contraction op? + if (!op.getMasks().empty()) + return failure(); + if (vectorTransformOptions.vectorContractLowering != + vector::VectorContractLowering::Matmul) + return failure(); + if (failed(filter(op))) + return failure(); + + auto iteratorTypes = op.getIteratorTypes().getValue(); + if (!isParallelIterator(iteratorTypes[0]) || + !isParallelIterator(iteratorTypes[1]) || + !isReductionIterator(iteratorTypes[2])) + return failure(); + + Type elementType = op.getLhsType().getElementType(); + if (!elementType.isIntOrFloat()) + return failure(); + + Type dstElementType = op.getType(); + if (auto vecType = dstElementType.dyn_cast()) + dstElementType = vecType.getElementType(); + if (elementType != dstElementType) + return failure(); + + // Perform lhs + rhs transpositions to conform to matmul row-major semantics. + // Bail out if the contraction cannot be put in this form. + MLIRContext *ctx = op.getContext(); + Location loc = op.getLoc(); + AffineExpr m, n, k; + bindDims(rew.getContext(), m, n, k); + // LHS must be A(m, k) or A(k, m). + Value lhs = op.getLhs(); + auto lhsMap = op.getIndexingMapsArray()[0]; + if (lhsMap == AffineMap::get(3, 0, {k, m}, ctx)) + lhs = rew.create(loc, lhs, ArrayRef{1, 0}); + else if (lhsMap != AffineMap::get(3, 0, {m, k}, ctx)) + return failure(); + + // RHS must be B(k, n) or B(n, k). + Value rhs = op.getRhs(); + auto rhsMap = op.getIndexingMapsArray()[1]; + if (rhsMap == AffineMap::get(3, 0, {n, k}, ctx)) + rhs = rew.create(loc, rhs, ArrayRef{1, 0}); + else if (rhsMap != AffineMap::get(3, 0, {k, n}, ctx)) + return failure(); + + // At this point lhs and rhs are in row-major. + VectorType lhsType = lhs.getType().cast(); + VectorType rhsType = rhs.getType().cast(); + int64_t lhsRows = lhsType.getDimSize(0); + int64_t lhsColumns = lhsType.getDimSize(1); + int64_t rhsColumns = rhsType.getDimSize(1); + + Type flattenedLHSType = + VectorType::get(lhsType.getNumElements(), lhsType.getElementType()); + lhs = rew.create(loc, flattenedLHSType, lhs); + + Type flattenedRHSType = + VectorType::get(rhsType.getNumElements(), rhsType.getElementType()); + rhs = rew.create(loc, flattenedRHSType, rhs); + + Value mul = rew.create(loc, lhs, rhs, lhsRows, lhsColumns, + rhsColumns); + mul = rew.create( + loc, + VectorType::get({lhsRows, rhsColumns}, + getElementTypeOrSelf(op.getAcc().getType())), + mul); + + // ACC must be C(m, n) or C(n, m). + auto accMap = op.getIndexingMapsArray()[2]; + if (accMap == AffineMap::get(3, 0, {n, m}, ctx)) + mul = rew.create(loc, mul, ArrayRef{1, 0}); + else if (accMap != AffineMap::get(3, 0, {m, n}, ctx)) + llvm_unreachable("invalid contraction semantics"); + + Value res = + elementType.isa() + ? 
static_cast(rew.create(loc, op.getAcc(), mul))
+          : static_cast(
+                rew.create(loc, op.getAcc(), mul));
+
+  rew.replaceOp(op, res);
+  return success();
+}
+} // namespace
+
+void mlir::vector::populateVectorContractLoweringPatterns(
+    RewritePatternSet &patterns, VectorTransformsOptions options,
+    PatternBenefit benefit, bool disableOuterProductLowering) {
+  if (!disableOuterProductLowering)
+    patterns.add(patterns.getContext(), benefit);
+  patterns.add(
+      options, patterns.getContext(), benefit);
+}
diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorGather.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorGather.cpp
new file mode 100644
index 0000000000000..dc10cb6278cb8
--- /dev/null
+++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorGather.cpp
@@ -0,0 +1,173 @@
+//===- LowerVectorGather.cpp - Lower 'vector.gather' operation -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements target-independent rewrites and utilities to lower the
+// 'vector.gather' operation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Arith/Utils/Utils.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Utils/IndexingUtils.h"
+#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
+#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
+#include "mlir/IR/BuiltinAttributeInterfaces.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/ImplicitLocOpBuilder.h"
+#include "mlir/IR/Location.h"
+#include "mlir/IR/Matchers.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/TypeUtilities.h"
+#include "mlir/Interfaces/VectorInterfaces.h"
+#include "mlir/Support/LogicalResult.h"
+
+#define DEBUG_TYPE "vector-broadcast-lowering"
+
+using namespace mlir;
+using namespace mlir::vector;
+
+namespace {
+/// Flattens 2 or more dimensional `vector.gather` ops by unrolling the
+/// outermost dimension. For example:
+/// ```
+/// %g = vector.gather %base[%c0][%v], %mask, %pass_thru :
+///        ... into vector<2x3xf32>
+///
+/// ==>
+///
+/// %0 = arith.constant dense<0.0> : vector<2x3xf32>
+/// %g0 = vector.gather %base[%c0][%v0], %mask0, %pass_thru0 : ...
+/// %1 = vector.insert %g0, %0 [0] : vector<3xf32> into vector<2x3xf32>
+/// %g1 = vector.gather %base[%c0][%v1], %mask1, %pass_thru1 : ...
+/// %g = vector.insert %g1, %1 [1] : vector<3xf32> into vector<2x3xf32>
+/// ```
+///
+/// When applied exhaustively, this will produce a sequence of 1-d gather ops.
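+///
+/// Both gather lowering patterns in this file are exposed through
+/// populateVectorGatherLoweringPatterns. A minimal usage sketch, assuming the
+/// greedy pattern rewrite driver is available:
+/// ```
+///    RewritePatternSet patterns(ctx);
+///    vector::populateVectorGatherLoweringPatterns(patterns, /*benefit=*/1);
+///    (void)applyPatternsAndFoldGreedily(op, std::move(patterns));
+/// ```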
+struct FlattenGather : OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(vector::GatherOp op, + PatternRewriter &rewriter) const override { + VectorType resultTy = op.getType(); + if (resultTy.getRank() < 2) + return rewriter.notifyMatchFailure(op, "already flat"); + + Location loc = op.getLoc(); + Value indexVec = op.getIndexVec(); + Value maskVec = op.getMask(); + Value passThruVec = op.getPassThru(); + + Value result = rewriter.create( + loc, resultTy, rewriter.getZeroAttr(resultTy)); + + Type subTy = VectorType::get(resultTy.getShape().drop_front(), + resultTy.getElementType()); + + for (int64_t i = 0, e = resultTy.getShape().front(); i < e; ++i) { + int64_t thisIdx[1] = {i}; + + Value indexSubVec = + rewriter.create(loc, indexVec, thisIdx); + Value maskSubVec = + rewriter.create(loc, maskVec, thisIdx); + Value passThruSubVec = + rewriter.create(loc, passThruVec, thisIdx); + Value subGather = rewriter.create( + loc, subTy, op.getBase(), op.getIndices(), indexSubVec, maskSubVec, + passThruSubVec); + result = + rewriter.create(loc, subGather, result, thisIdx); + } + + rewriter.replaceOp(op, result); + return success(); + } +}; + +/// Turns 1-d `vector.gather` into a scalarized sequence of `vector.loads` or +/// `tensor.extract`s. To avoid out-of-bounds memory accesses, these +/// loads/extracts are made conditional using `scf.if` ops. +struct Gather1DToConditionalLoads : OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(vector::GatherOp op, + PatternRewriter &rewriter) const override { + VectorType resultTy = op.getType(); + if (resultTy.getRank() != 1) + return rewriter.notifyMatchFailure(op, "unsupported rank"); + + Location loc = op.getLoc(); + Type elemTy = resultTy.getElementType(); + // Vector type with a single element. Used to generate `vector.loads`. + VectorType elemVecTy = VectorType::get({1}, elemTy); + + Value condMask = op.getMask(); + Value base = op.getBase(); + Value indexVec = rewriter.createOrFold( + loc, op.getIndexVectorType().clone(rewriter.getIndexType()), + op.getIndexVec()); + auto baseOffsets = llvm::to_vector(op.getIndices()); + Value lastBaseOffset = baseOffsets.back(); + + Value result = op.getPassThru(); + + // Emit a conditional access for each vector element. + for (int64_t i = 0, e = resultTy.getNumElements(); i < e; ++i) { + int64_t thisIdx[1] = {i}; + Value condition = + rewriter.create(loc, condMask, thisIdx); + Value index = rewriter.create(loc, indexVec, thisIdx); + baseOffsets.back() = + rewriter.createOrFold(loc, lastBaseOffset, index); + + auto loadBuilder = [&](OpBuilder &b, Location loc) { + Value extracted; + if (isa(base.getType())) { + // `vector.load` does not support scalar result; emit a vector load + // and extract the single result instead. 
+ Value load = + b.create(loc, elemVecTy, base, baseOffsets); + int64_t zeroIdx[1] = {0}; + extracted = b.create(loc, load, zeroIdx); + } else { + extracted = b.create(loc, base, baseOffsets); + } + + Value newResult = + b.create(loc, extracted, result, thisIdx); + b.create(loc, newResult); + }; + auto passThruBuilder = [result](OpBuilder &b, Location loc) { + b.create(loc, result); + }; + + result = + rewriter + .create(loc, condition, /*thenBuilder=*/loadBuilder, + /*elseBuilder=*/passThruBuilder) + .getResult(0); + } + + rewriter.replaceOp(op, result); + return success(); + } +}; +} // namespace + +void mlir::vector::populateVectorGatherLoweringPatterns( + RewritePatternSet &patterns, PatternBenefit benefit) { + patterns.add(patterns.getContext(), + benefit); +} diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorMask.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorMask.cpp index 7c66e65fdef8b..e318d4dc15915 100644 --- a/mlir/lib/Dialect/Vector/Transforms/LowerVectorMask.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorMask.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// This file implements target-independent rewrites and utilitites to lower the +// This file implements target-independent rewrites and utilities to lower the // 'vector.mask' operation. // //===----------------------------------------------------------------------===// @@ -14,6 +14,7 @@ #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" #include "mlir/Dialect/Vector/Transforms/Passes.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" @@ -30,6 +31,147 @@ namespace vector { using namespace mlir; using namespace mlir::vector; +//===----------------------------------------------------------------------===// +// populateVectorMaskOpLoweringPatterns +//===----------------------------------------------------------------------===// + +namespace { +/// Progressive lowering of CreateMaskOp. +/// One: +/// %x = vector.create_mask %a, ... : vector +/// is replaced by: +/// %l = vector.create_mask ... : vector<...> ; one lower rank +/// %0 = arith.cmpi "slt", %ci, %a | +/// %1 = select %0, %l, %zeroes | +/// %r = vector.insert %1, %pr [i] | d-times +/// %x = .... +/// until a one-dimensional vector is reached. 
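+///
+/// For example (shapes assumed here only for illustration), lowering
+/// ```
+///    %m = vector.create_mask %a, %b : vector<4x8xi1>
+/// ```
+/// produces, roughly:
+/// ```
+///    %init = arith.constant dense<false> : vector<4x8xi1>
+///    %sub  = vector.create_mask %b : vector<8xi1>
+///    %zero = arith.constant dense<false> : vector<8xi1>
+///    // repeated for each row index d = 0, 1, 2, 3:
+///    %cd   = arith.constant 0 : index
+///    %cmp  = arith.cmpi slt, %cd, %a : index
+///    %row  = arith.select %cmp, %sub, %zero : vector<8xi1>
+///    %m0   = vector.insert %row, %init [0] : vector<8xi1> into vector<4x8xi1>
+/// ```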
+class CreateMaskOpLowering : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(vector::CreateMaskOp op, + PatternRewriter &rewriter) const override { + auto dstType = op.getResult().getType().cast(); + int64_t rank = dstType.getRank(); + if (rank <= 1) + return rewriter.notifyMatchFailure( + op, "0-D and 1-D vectors are handled separately"); + + auto loc = op.getLoc(); + auto eltType = dstType.getElementType(); + int64_t dim = dstType.getDimSize(0); + Value idx = op.getOperand(0); + + VectorType lowType = + VectorType::get(dstType.getShape().drop_front(), eltType); + Value trueVal = rewriter.create( + loc, lowType, op.getOperands().drop_front()); + Value falseVal = rewriter.create( + loc, lowType, rewriter.getZeroAttr(lowType)); + Value result = rewriter.create( + loc, dstType, rewriter.getZeroAttr(dstType)); + for (int64_t d = 0; d < dim; d++) { + Value bnd = + rewriter.create(loc, rewriter.getIndexAttr(d)); + Value val = rewriter.create(loc, arith::CmpIPredicate::slt, + bnd, idx); + Value sel = rewriter.create(loc, val, trueVal, falseVal); + auto pos = rewriter.getI64ArrayAttr(d); + result = + rewriter.create(loc, dstType, sel, result, pos); + } + rewriter.replaceOp(op, result); + return success(); + } +}; + +/// Progressive lowering of ConstantMaskOp. +/// One: +/// %x = vector.constant_mask [a,b] +/// is replaced by: +/// %z = zero-result +/// %l = vector.constant_mask [b] +/// %4 = vector.insert %l, %z[0] +/// .. +/// %x = vector.insert %l, %..[a-1] +/// until a one-dimensional vector is reached. All these operations +/// will be folded at LLVM IR level. +class ConstantMaskOpLowering : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(vector::ConstantMaskOp op, + PatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto dstType = op.getType(); + auto eltType = dstType.getElementType(); + auto dimSizes = op.getMaskDimSizes(); + int64_t rank = dstType.getRank(); + + if (rank == 0) { + assert(dimSizes.size() == 1 && + "Expected exactly one dim size for a 0-D vector"); + bool value = dimSizes[0].cast().getInt() == 1; + rewriter.replaceOpWithNewOp( + op, dstType, + DenseIntElementsAttr::get( + VectorType::get(ArrayRef{}, rewriter.getI1Type()), + ArrayRef{value})); + return success(); + } + + // Scalable constant masks can only be lowered for the "none set" case. + if (dstType.cast().isScalable()) { + rewriter.replaceOpWithNewOp( + op, DenseElementsAttr::get(dstType, false)); + return success(); + } + + int64_t trueDim = std::min(dstType.getDimSize(0), + dimSizes[0].cast().getInt()); + + if (rank == 1) { + // Express constant 1-D case in explicit vector form: + // [T,..,T,F,..,F]. 
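+      // E.g. a constant mask of size [3] on a vector<5xi1> becomes the dense
+      // constant [true, true, true, false, false].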
+ SmallVector values(dstType.getDimSize(0)); + for (int64_t d = 0; d < trueDim; d++) + values[d] = true; + rewriter.replaceOpWithNewOp( + op, dstType, rewriter.getBoolVectorAttr(values)); + return success(); + } + + VectorType lowType = + VectorType::get(dstType.getShape().drop_front(), eltType); + SmallVector newDimSizes; + for (int64_t r = 1; r < rank; r++) + newDimSizes.push_back(dimSizes[r].cast().getInt()); + Value trueVal = rewriter.create( + loc, lowType, rewriter.getI64ArrayAttr(newDimSizes)); + Value result = rewriter.create( + loc, dstType, rewriter.getZeroAttr(dstType)); + for (int64_t d = 0; d < trueDim; d++) { + auto pos = rewriter.getI64ArrayAttr(d); + result = + rewriter.create(loc, dstType, trueVal, result, pos); + } + rewriter.replaceOp(op, result); + return success(); + } +}; +} // namespace + +void mlir::vector::populateVectorMaskOpLoweringPatterns( + RewritePatternSet &patterns, PatternBenefit benefit) { + patterns.add( + patterns.getContext(), benefit); +} + +//===----------------------------------------------------------------------===// +// populateVectorMaskLoweringPatternsForSideEffectingOps +//===----------------------------------------------------------------------===// + namespace { /// The `MaskOpRewritePattern` implements a pattern that follows a two-fold diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorMultiDimReductionTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorMultiReduction.cpp similarity index 98% rename from mlir/lib/Dialect/Vector/Transforms/VectorMultiDimReductionTransforms.cpp rename to mlir/lib/Dialect/Vector/Transforms/LowerVectorMultiReduction.cpp index b790d141415aa..1744c46db5886 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorMultiDimReductionTransforms.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorMultiReduction.cpp @@ -1,4 +1,4 @@ -//===- VectorMultiDimReductionTransforms.cpp - Multi-Reduction Transforms -===// +//===- LowerVectorMultiReduction.cpp - Lower `vector.multi_reduction` op --===// // /// Part of the LLVM Project, under the Apache License v2.0 with LLVM /// Exceptions. See https://llvm.org/LICENSE.txt for license information. @@ -6,12 +6,13 @@ // //===----------------------------------------------------------------------===// // -/// This file implements target-independent rewrites of MultiDimReductionOp. +// This file implements target-independent rewrites and utilities to lower the +// 'vector.multi_reduction' operation. // //===----------------------------------------------------------------------===// #include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h" +#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" #include "mlir/IR/Builders.h" #include "mlir/IR/TypeUtilities.h" @@ -19,6 +20,7 @@ using namespace mlir; +namespace { /// This file implements the following transformations as composable atomic /// patterns. 
@@ -441,6 +443,7 @@ struct OneDimMultiReductionToTwoDim
     return success();
   }
 };
+} // namespace
 
 void mlir::vector::populateVectorMultiReductionLoweringPatterns(
     RewritePatternSet &patterns, VectorMultiReductionLowering options,
diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorScan.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorScan.cpp
new file mode 100644
index 0000000000000..eb2deba7bc46b
--- /dev/null
+++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorScan.cpp
@@ -0,0 +1,251 @@
+//===- LowerVectorScan.cpp - Lower 'vector.scan' operation ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements target-independent rewrites and utilities to lower the
+// 'vector.scan' operation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Arith/Utils/Utils.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Utils/IndexingUtils.h"
+#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
+#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
+#include "mlir/IR/BuiltinAttributeInterfaces.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/ImplicitLocOpBuilder.h"
+#include "mlir/IR/Location.h"
+#include "mlir/IR/Matchers.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/TypeUtilities.h"
+#include "mlir/Interfaces/VectorInterfaces.h"
+#include "mlir/Support/LogicalResult.h"
+
+#define DEBUG_TYPE "vector-broadcast-lowering"
+
+using namespace mlir;
+using namespace mlir::vector;
+
+/// This function constructs the appropriate integer or float
+/// operation given the vector combining kind and operands. The
+/// supported int operations are: add, mul, min (signed/unsigned),
+/// max (signed/unsigned), and, or, xor. The supported float
+/// operations are: add, mul, min and max.
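+/// For example, CombiningKind::ADD maps to arith.addi for integer element
+/// types and arith.addf for floating-point element types.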
+static Value genOperator(Location loc, Value x, Value y, + vector::CombiningKind kind, + PatternRewriter &rewriter) { + using vector::CombiningKind; + + auto elType = x.getType().cast().getElementType(); + bool isInt = elType.isIntOrIndex(); + + Value combinedResult{nullptr}; + switch (kind) { + case CombiningKind::ADD: + if (isInt) + combinedResult = rewriter.create(loc, x, y); + else + combinedResult = rewriter.create(loc, x, y); + break; + case CombiningKind::MUL: + if (isInt) + combinedResult = rewriter.create(loc, x, y); + else + combinedResult = rewriter.create(loc, x, y); + break; + case CombiningKind::MINUI: + combinedResult = rewriter.create(loc, x, y); + break; + case CombiningKind::MINSI: + combinedResult = rewriter.create(loc, x, y); + break; + case CombiningKind::MAXUI: + combinedResult = rewriter.create(loc, x, y); + break; + case CombiningKind::MAXSI: + combinedResult = rewriter.create(loc, x, y); + break; + case CombiningKind::AND: + combinedResult = rewriter.create(loc, x, y); + break; + case CombiningKind::OR: + combinedResult = rewriter.create(loc, x, y); + break; + case CombiningKind::XOR: + combinedResult = rewriter.create(loc, x, y); + break; + case CombiningKind::MINF: + combinedResult = rewriter.create(loc, x, y); + break; + case CombiningKind::MAXF: + combinedResult = rewriter.create(loc, x, y); + break; + } + return combinedResult; +} + +/// This function checks to see if the vector combining kind +/// is consistent with the integer or float element type. +static bool isValidKind(bool isInt, vector::CombiningKind kind) { + using vector::CombiningKind; + enum class KindType { FLOAT, INT, INVALID }; + KindType type{KindType::INVALID}; + switch (kind) { + case CombiningKind::MINF: + case CombiningKind::MAXF: + type = KindType::FLOAT; + break; + case CombiningKind::MINUI: + case CombiningKind::MINSI: + case CombiningKind::MAXUI: + case CombiningKind::MAXSI: + case CombiningKind::AND: + case CombiningKind::OR: + case CombiningKind::XOR: + type = KindType::INT; + break; + case CombiningKind::ADD: + case CombiningKind::MUL: + type = isInt ? KindType::INT : KindType::FLOAT; + break; + } + bool isValidIntKind = (type == KindType::INT) && isInt; + bool isValidFloatKind = (type == KindType::FLOAT) && (!isInt); + return (isValidIntKind || isValidFloatKind); +} + +namespace { +/// Convert vector.scan op into arith ops and vector.insert_strided_slice / +/// vector.extract_strided_slice. 
+/// +/// Example: +/// +/// ``` +/// %0:2 = vector.scan , %arg0, %arg1 +/// {inclusive = true, reduction_dim = 1} : +/// (vector<2x3xi32>, vector<2xi32>) to (vector<2x3xi32>, vector<2xi32>) +/// ``` +/// +/// is converted to: +/// +/// ``` +/// %cst = arith.constant dense<0> : vector<2x3xi32> +/// %0 = vector.extract_strided_slice %arg0 +/// {offsets = [0, 0], sizes = [2, 1], strides = [1, 1]} +/// : vector<2x3xi32> to vector<2x1xi32> +/// %1 = vector.insert_strided_slice %0, %cst +/// {offsets = [0, 0], strides = [1, 1]} +/// : vector<2x1xi32> into vector<2x3xi32> +/// %2 = vector.extract_strided_slice %arg0 +/// {offsets = [0, 1], sizes = [2, 1], strides = [1, 1]} +/// : vector<2x3xi32> to vector<2x1xi32> +/// %3 = arith.muli %0, %2 : vector<2x1xi32> +/// %4 = vector.insert_strided_slice %3, %1 +/// {offsets = [0, 1], strides = [1, 1]} +/// : vector<2x1xi32> into vector<2x3xi32> +/// %5 = vector.extract_strided_slice %arg0 +/// {offsets = [0, 2], sizes = [2, 1], strides = [1, 1]} +/// : vector<2x3xi32> to vector<2x1xi32> +/// %6 = arith.muli %3, %5 : vector<2x1xi32> +/// %7 = vector.insert_strided_slice %6, %4 +/// {offsets = [0, 2], strides = [1, 1]} +/// : vector<2x1xi32> into vector<2x3xi32> +/// %8 = vector.shape_cast %6 : vector<2x1xi32> to vector<2xi32> +/// return %7, %8 : vector<2x3xi32>, vector<2xi32> +/// ``` +struct ScanToArithOps : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(vector::ScanOp scanOp, + PatternRewriter &rewriter) const override { + auto loc = scanOp.getLoc(); + VectorType destType = scanOp.getDestType(); + ArrayRef destShape = destType.getShape(); + auto elType = destType.getElementType(); + bool isInt = elType.isIntOrIndex(); + if (!isValidKind(isInt, scanOp.getKind())) + return failure(); + + VectorType resType = VectorType::get(destShape, elType); + Value result = rewriter.create( + loc, resType, rewriter.getZeroAttr(resType)); + int64_t reductionDim = scanOp.getReductionDim(); + bool inclusive = scanOp.getInclusive(); + int64_t destRank = destType.getRank(); + VectorType initialValueType = scanOp.getInitialValueType(); + int64_t initialValueRank = initialValueType.getRank(); + + SmallVector reductionShape(destShape.begin(), destShape.end()); + reductionShape[reductionDim] = 1; + VectorType reductionType = VectorType::get(reductionShape, elType); + SmallVector offsets(destRank, 0); + SmallVector strides(destRank, 1); + SmallVector sizes(destShape.begin(), destShape.end()); + sizes[reductionDim] = 1; + ArrayAttr scanSizes = rewriter.getI64ArrayAttr(sizes); + ArrayAttr scanStrides = rewriter.getI64ArrayAttr(strides); + + Value lastOutput, lastInput; + for (int i = 0; i < destShape[reductionDim]; i++) { + offsets[reductionDim] = i; + ArrayAttr scanOffsets = rewriter.getI64ArrayAttr(offsets); + Value input = rewriter.create( + loc, reductionType, scanOp.getSource(), scanOffsets, scanSizes, + scanStrides); + Value output; + if (i == 0) { + if (inclusive) { + output = input; + } else { + if (initialValueRank == 0) { + // ShapeCastOp cannot handle 0-D vectors + output = rewriter.create( + loc, input.getType(), scanOp.getInitialValue()); + } else { + output = rewriter.create( + loc, input.getType(), scanOp.getInitialValue()); + } + } + } else { + Value y = inclusive ? 
input : lastInput; + output = genOperator(loc, lastOutput, y, scanOp.getKind(), rewriter); + assert(output != nullptr); + } + result = rewriter.create( + loc, output, result, offsets, strides); + lastOutput = output; + lastInput = input; + } + + Value reduction; + if (initialValueRank == 0) { + Value v = rewriter.create(loc, lastOutput, 0); + reduction = + rewriter.create(loc, initialValueType, v); + } else { + reduction = rewriter.create(loc, initialValueType, + lastOutput); + } + + rewriter.replaceOp(scanOp, {result, reduction}); + return success(); + } +}; +} // namespace + +void mlir::vector::populateVectorScanLoweringPatterns( + RewritePatternSet &patterns, PatternBenefit benefit) { + patterns.add(patterns.getContext(), benefit); +} diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorShapeCast.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorShapeCast.cpp new file mode 100644 index 0000000000000..bd9716cbca94c --- /dev/null +++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorShapeCast.cpp @@ -0,0 +1,177 @@ +//===- LowerVectorShapeCast.cpp - Lower 'vector.shape_cast' operation -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements target-independent rewrites and utilities to lower the +// 'vector.shape_cast' operation. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Arith/Utils/Utils.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Utils/IndexingUtils.h" +#include "mlir/Dialect/Utils/StructuredOpsUtils.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" +#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h" +#include "mlir/Dialect/Vector/Utils/VectorUtils.h" +#include "mlir/IR/BuiltinAttributeInterfaces.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/ImplicitLocOpBuilder.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/Matchers.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/Interfaces/VectorInterfaces.h" +#include "mlir/Support/LogicalResult.h" + +#define DEBUG_TYPE "vector-shape-cast-lowering" + +using namespace mlir; +using namespace mlir::vector; + +namespace { +/// ShapeOp 2D -> 1D downcast serves the purpose of flattening 2-D to 1-D +/// vectors progressively on the way to target llvm.matrix intrinsics. 
+/// This iterates over the most major dimension of the 2-D vector and performs
+/// rewrites into:
+///   vector.extract from 2-D + vector.insert_strided_slice offset into 1-D
+class ShapeCastOp2DDownCastRewritePattern
+    : public OpRewritePattern {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(vector::ShapeCastOp op,
+                                PatternRewriter &rewriter) const override {
+    auto sourceVectorType = op.getSourceVectorType();
+    auto resultVectorType = op.getResultVectorType();
+    if (sourceVectorType.getRank() != 2 || resultVectorType.getRank() != 1)
+      return failure();
+
+    auto loc = op.getLoc();
+    Value desc = rewriter.create(
+        loc, resultVectorType, rewriter.getZeroAttr(resultVectorType));
+    unsigned mostMinorVectorSize = sourceVectorType.getShape()[1];
+    for (int64_t i = 0, e = sourceVectorType.getShape().front(); i != e; ++i) {
+      Value vec = rewriter.create(loc, op.getSource(), i);
+      desc = rewriter.create(
+          loc, vec, desc,
+          /*offsets=*/i * mostMinorVectorSize, /*strides=*/1);
+    }
+    rewriter.replaceOp(op, desc);
+    return success();
+  }
+};
+
+/// ShapeOp 1D -> 2D upcast serves the purpose of unflattening 2-D from 1-D
+/// vectors progressively.
+/// This iterates over the most major dimension of the 2-D vector and performs
+/// rewrites into:
+///   vector.extract_strided_slice from 1-D + vector.insert into 2-D
+/// Note that 1-D extract_strided_slice ops are lowered to efficient
+/// vector.shuffle ops.
+class ShapeCastOp2DUpCastRewritePattern
+    : public OpRewritePattern {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(vector::ShapeCastOp op,
+                                PatternRewriter &rewriter) const override {
+    auto sourceVectorType = op.getSourceVectorType();
+    auto resultVectorType = op.getResultVectorType();
+    if (sourceVectorType.getRank() != 1 || resultVectorType.getRank() != 2)
+      return failure();
+
+    auto loc = op.getLoc();
+    Value desc = rewriter.create(
+        loc, resultVectorType, rewriter.getZeroAttr(resultVectorType));
+    unsigned mostMinorVectorSize = resultVectorType.getShape()[1];
+    for (int64_t i = 0, e = resultVectorType.getShape().front(); i != e; ++i) {
+      Value vec = rewriter.create(
+          loc, op.getSource(), /*offsets=*/i * mostMinorVectorSize,
+          /*sizes=*/mostMinorVectorSize,
+          /*strides=*/1);
+      desc = rewriter.create(loc, vec, desc, i);
+    }
+    rewriter.replaceOp(op, desc);
+    return success();
+  }
+};
+
+// We typically should not lower general shape cast operations into data
+// movement instructions, since the assumption is that these casts are
+// optimized away during progressive lowering. For completeness, however,
+// we fall back to a reference implementation that moves all elements
+// into the right place if we get here.
+class ShapeCastOpRewritePattern : public OpRewritePattern {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(vector::ShapeCastOp op,
+                                PatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+    auto sourceVectorType = op.getSourceVectorType();
+    auto resultVectorType = op.getResultVectorType();
+
+    // Special case 2D / 1D lowerings with better implementations.
+    // TODO: make it ND / 1D to allow generic ND -> 1D -> MD.
+    int64_t srcRank = sourceVectorType.getRank();
+    int64_t resRank = resultVectorType.getRank();
+    if ((srcRank == 2 && resRank == 1) || (srcRank == 1 && resRank == 2))
+      return failure();
+
+    // Generic ShapeCast lowering path goes all the way down to unrolled scalar
+    // extract/insert chains.
+ // TODO: consider evolving the semantics to only allow 1D source or dest and + // drop this potentially very expensive lowering. + // Compute number of elements involved in the reshape. + int64_t numElts = 1; + for (int64_t r = 0; r < srcRank; r++) + numElts *= sourceVectorType.getDimSize(r); + // Replace with data movement operations: + // x[0,0,0] = y[0,0] + // x[0,0,1] = y[0,1] + // x[0,1,0] = y[0,2] + // etc., incrementing the two index vectors "row-major" + // within the source and result shape. + SmallVector srcIdx(srcRank); + SmallVector resIdx(resRank); + Value result = rewriter.create( + loc, resultVectorType, rewriter.getZeroAttr(resultVectorType)); + for (int64_t i = 0; i < numElts; i++) { + if (i != 0) { + incIdx(srcIdx, sourceVectorType, srcRank - 1); + incIdx(resIdx, resultVectorType, resRank - 1); + } + Value e = rewriter.create(loc, op.getSource(), srcIdx); + result = rewriter.create(loc, e, result, resIdx); + } + rewriter.replaceOp(op, result); + return success(); + } + +private: + static void incIdx(SmallVector &idx, VectorType tp, int64_t r) { + assert(0 <= r && r < tp.getRank()); + if (++idx[r] == tp.getDimSize(r)) { + idx[r] = 0; + incIdx(idx, tp, r - 1); + } + } +}; +} // namespace + +void mlir::vector::populateVectorShapeCastLoweringPatterns( + RewritePatternSet &patterns, PatternBenefit benefit) { + patterns.add( + patterns.getContext(), benefit); +} diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransferPermutationMapRewritePatterns.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorTransfer.cpp similarity index 57% rename from mlir/lib/Dialect/Vector/Transforms/VectorTransferPermutationMapRewritePatterns.cpp rename to mlir/lib/Dialect/Vector/Transforms/LowerVectorTransfer.cpp index 68d9a349478bf..c2ce9aa10a850 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransferPermutationMapRewritePatterns.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorTransfer.cpp @@ -14,7 +14,7 @@ #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h" +#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" #include "mlir/Interfaces/VectorInterfaces.h" using namespace mlir; @@ -46,6 +46,11 @@ static Value extendVectorRank(OpBuilder &builder, Location loc, Value vec, return builder.create(loc, newVecType, vec); } +//===----------------------------------------------------------------------===// +// populateVectorTransferPermutationMapLoweringPatterns +//===----------------------------------------------------------------------===// + +namespace { /// Lower transfer_read op with permutation into a transfer_read with a /// permutation map composed of leading zeros followed by a minor identiy + /// vector.transpose op. @@ -332,6 +337,8 @@ struct TransferOpReduceRank : public OpRewritePattern { } }; +} // namespace + void mlir::vector::populateVectorTransferPermutationMapLoweringPatterns( RewritePatternSet &patterns, PatternBenefit benefit) { patterns @@ -339,3 +346,239 @@ void mlir::vector::populateVectorTransferPermutationMapLoweringPatterns( TransferOpReduceRank, TransferWriteNonPermutationLowering>( patterns.getContext(), benefit); } + +//===----------------------------------------------------------------------===// +// populateVectorTransferLoweringPatterns +//===----------------------------------------------------------------------===// + +namespace { +/// Progressive lowering of transfer_read. 
This pattern supports lowering of +/// `vector.transfer_read` to a combination of `vector.load` and +/// `vector.broadcast` if all of the following hold: +/// - Stride of most minor memref dimension must be 1. +/// - Out-of-bounds masking is not required. +/// - If the memref's element type is a vector type then it coincides with the +/// result type. +/// - The permutation map doesn't perform permutation (broadcasting is allowed). +struct TransferReadToVectorLoadLowering + : public OpRewritePattern { + TransferReadToVectorLoadLowering(MLIRContext *context, + std::optional maxRank, + PatternBenefit benefit = 1) + : OpRewritePattern(context, benefit), + maxTransferRank(maxRank) {} + + LogicalResult matchAndRewrite(vector::TransferReadOp read, + PatternRewriter &rewriter) const override { + if (maxTransferRank && read.getVectorType().getRank() > *maxTransferRank) + return failure(); + + SmallVector broadcastedDims; + // Permutations are handled by VectorToSCF or + // populateVectorTransferPermutationMapLoweringPatterns. + // We let the 0-d corner case pass-through as it is supported. + if (!read.getPermutationMap().isMinorIdentityWithBroadcasting( + &broadcastedDims)) + return failure(); + + auto memRefType = read.getShapedType().dyn_cast(); + if (!memRefType) + return failure(); + + // Non-unit strides are handled by VectorToSCF. + if (!vector::isLastMemrefDimUnitStride(memRefType)) + return failure(); + + // If there is broadcasting involved then we first load the unbroadcasted + // vector, and then broadcast it with `vector.broadcast`. + ArrayRef vectorShape = read.getVectorType().getShape(); + SmallVector unbroadcastedVectorShape(vectorShape.begin(), + vectorShape.end()); + for (unsigned i : broadcastedDims) + unbroadcastedVectorShape[i] = 1; + VectorType unbroadcastedVectorType = VectorType::get( + unbroadcastedVectorShape, read.getVectorType().getElementType()); + + // `vector.load` supports vector types as memref's elements only when the + // resulting vector type is the same as the element type. + auto memrefElTy = memRefType.getElementType(); + if (memrefElTy.isa() && memrefElTy != unbroadcastedVectorType) + return failure(); + + // Otherwise, element types of the memref and the vector must match. + if (!memrefElTy.isa() && + memrefElTy != read.getVectorType().getElementType()) + return failure(); + + // Out-of-bounds dims are handled by MaterializeTransferMask. + if (read.hasOutOfBoundsDim()) + return failure(); + + // Create vector load op. + Operation *loadOp; + if (read.getMask()) { + Value fill = rewriter.create( + read.getLoc(), unbroadcastedVectorType, read.getPadding()); + loadOp = rewriter.create( + read.getLoc(), unbroadcastedVectorType, read.getSource(), + read.getIndices(), read.getMask(), fill); + } else { + loadOp = rewriter.create( + read.getLoc(), unbroadcastedVectorType, read.getSource(), + read.getIndices()); + } + + // Insert a broadcasting op if required. + if (!broadcastedDims.empty()) { + rewriter.replaceOpWithNewOp( + read, read.getVectorType(), loadOp->getResult(0)); + } else { + rewriter.replaceOp(read, loadOp->getResult(0)); + } + + return success(); + } + + std::optional maxTransferRank; +}; + +/// Replace a 0-d vector.load with a memref.load + vector.broadcast. +// TODO: we shouldn't cross the vector/scalar domains just for this +// but atm we lack the infra to avoid it. 
Possible solutions include: +// - go directly to LLVM + bitcast +// - introduce a bitcast op and likely a new pointer dialect +// - let memref.load/store additionally support the 0-d vector case +// There are still deeper data layout issues lingering even in this +// trivial case (for architectures for which this matters). +struct VectorLoadToMemrefLoadLowering + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(vector::LoadOp loadOp, + PatternRewriter &rewriter) const override { + auto vecType = loadOp.getVectorType(); + if (vecType.getNumElements() != 1) + return failure(); + auto memrefLoad = rewriter.create( + loadOp.getLoc(), loadOp.getBase(), loadOp.getIndices()); + rewriter.replaceOpWithNewOp(loadOp, vecType, + memrefLoad); + return success(); + } +}; + +/// Replace a 0-d vector.store with a vector.extractelement + memref.store. +struct VectorStoreToMemrefStoreLowering + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(vector::StoreOp storeOp, + PatternRewriter &rewriter) const override { + auto vecType = storeOp.getVectorType(); + if (vecType.getNumElements() != 1) + return failure(); + Value extracted; + if (vecType.getRank() == 0) { + // TODO: Unifiy once ExtractOp supports 0-d vectors. + extracted = rewriter.create( + storeOp.getLoc(), storeOp.getValueToStore()); + } else { + SmallVector indices(vecType.getRank(), 0); + extracted = rewriter.create( + storeOp.getLoc(), storeOp.getValueToStore(), indices); + } + + rewriter.replaceOpWithNewOp( + storeOp, extracted, storeOp.getBase(), storeOp.getIndices()); + return success(); + } +}; + +/// Progressive lowering of transfer_write. This pattern supports lowering of +/// `vector.transfer_write` to `vector.store` if all of the following hold: +/// - Stride of most minor memref dimension must be 1. +/// - Out-of-bounds masking is not required. +/// - If the memref's element type is a vector type then it coincides with the +/// type of the written value. +/// - The permutation map is the minor identity map (neither permutation nor +/// broadcasting is allowed). +struct TransferWriteToVectorStoreLowering + : public OpRewritePattern { + TransferWriteToVectorStoreLowering(MLIRContext *context, + std::optional maxRank, + PatternBenefit benefit = 1) + : OpRewritePattern(context, benefit), + maxTransferRank(maxRank) {} + + LogicalResult matchAndRewrite(vector::TransferWriteOp write, + PatternRewriter &rewriter) const override { + if (maxTransferRank && write.getVectorType().getRank() > *maxTransferRank) + return rewriter.notifyMatchFailure(write.getLoc(), [=](Diagnostic &diag) { + diag << "rank exceeds maxTransferRank: " << write; + }); + + // Permutations are handled by VectorToSCF or + // populateVectorTransferPermutationMapLoweringPatterns. + if ( // pass-through for the 0-d corner case. + !write.getPermutationMap().isMinorIdentity()) + return rewriter.notifyMatchFailure(write.getLoc(), [=](Diagnostic &diag) { + diag << "permutation map is not minor identity: " << write; + }); + + auto memRefType = write.getShapedType().dyn_cast(); + if (!memRefType) + return rewriter.notifyMatchFailure(write.getLoc(), [=](Diagnostic &diag) { + diag << "not a memref type: " << write; + }); + + // Non-unit strides are handled by VectorToSCF. 
+ if (!vector::isLastMemrefDimUnitStride(memRefType)) + return rewriter.notifyMatchFailure(write.getLoc(), [=](Diagnostic &diag) { + diag << "most minor stride is not 1: " << write; + }); + + // `vector.store` supports vector types as memref's elements only when the + // type of the vector value being written is the same as the element type. + auto memrefElTy = memRefType.getElementType(); + if (memrefElTy.isa() && memrefElTy != write.getVectorType()) + return rewriter.notifyMatchFailure(write.getLoc(), [=](Diagnostic &diag) { + diag << "elemental type mismatch: " << write; + }); + + // Otherwise, element types of the memref and the vector must match. + if (!memrefElTy.isa() && + memrefElTy != write.getVectorType().getElementType()) + return rewriter.notifyMatchFailure(write.getLoc(), [=](Diagnostic &diag) { + diag << "elemental type mismatch: " << write; + }); + + // Out-of-bounds dims are handled by MaterializeTransferMask. + if (write.hasOutOfBoundsDim()) + return rewriter.notifyMatchFailure(write.getLoc(), [=](Diagnostic &diag) { + diag << "out of bounds dim: " << write; + }); + if (write.getMask()) { + rewriter.replaceOpWithNewOp( + write, write.getSource(), write.getIndices(), write.getMask(), + write.getVector()); + } else { + rewriter.replaceOpWithNewOp( + write, write.getVector(), write.getSource(), write.getIndices()); + } + return success(); + } + + std::optional maxTransferRank; +}; +} // namespace + +void mlir::vector::populateVectorTransferLoweringPatterns( + RewritePatternSet &patterns, std::optional maxTransferRank, + PatternBenefit benefit) { + patterns.add(patterns.getContext(), + maxTransferRank, benefit); + patterns + .add( + patterns.getContext(), benefit); +} diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorTranspose.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorTranspose.cpp new file mode 100644 index 0000000000000..f6e8b0c445c99 --- /dev/null +++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorTranspose.cpp @@ -0,0 +1,210 @@ +//===- LowerVectorTranspose.cpp - Lower 'vector.transpose' operation ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements target-independent rewrites and utilities to lower the +// 'vector.transpose' operation. 
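+// For example (illustrative only, not taken from the original commit), these
+// patterns rewrite:
+//   %t = vector.transpose %v, [1, 0] : vector<2x4xf32> to vector<4x2xf32>
+// either into an unrolled chain of vector.extract / vector.insert ops or,
+// when the Shuffle option is requested, into
+//   vector.shape_cast + vector.shuffle + vector.shape_cast.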
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Arith/Utils/Utils.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Utils/IndexingUtils.h" +#include "mlir/Dialect/Utils/StructuredOpsUtils.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" +#include "mlir/Dialect/Vector/Utils/VectorUtils.h" +#include "mlir/IR/BuiltinAttributeInterfaces.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/ImplicitLocOpBuilder.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/Matchers.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/Interfaces/VectorInterfaces.h" +#include "mlir/Support/LogicalResult.h" + +#define DEBUG_TYPE "vector-shape-cast-lowering" + +using namespace mlir; +using namespace mlir::vector; + +/// Given a 'transpose' pattern, prune the rightmost dimensions that are not +/// transposed. +static void pruneNonTransposedDims(ArrayRef transpose, + SmallVectorImpl &result) { + size_t numTransposedDims = transpose.size(); + for (size_t transpDim : llvm::reverse(transpose)) { + if (transpDim != numTransposedDims - 1) + break; + numTransposedDims--; + } + + result.append(transpose.begin(), transpose.begin() + numTransposedDims); +} + +namespace { +/// Progressive lowering of TransposeOp. +/// One: +/// %x = vector.transpose %y, [1, 0] +/// is replaced by: +/// %z = arith.constant dense<0.000000e+00> +/// %0 = vector.extract %y[0, 0] +/// %1 = vector.insert %0, %z [0, 0] +/// .. +/// %x = vector.insert .., .. [.., ..] +class TransposeOpLowering : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + TransposeOpLowering(vector::VectorTransformsOptions vectorTransformOptions, + MLIRContext *context, PatternBenefit benefit = 1) + : OpRewritePattern(context, benefit), + vectorTransformOptions(vectorTransformOptions) {} + + LogicalResult matchAndRewrite(vector::TransposeOp op, + PatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value input = op.getVector(); + VectorType inputType = op.getSourceVectorType(); + VectorType resType = op.getResultVectorType(); + + // Set up convenience transposition table. + SmallVector transp; + for (auto attr : op.getTransp()) + transp.push_back(attr.cast().getInt()); + + if (vectorTransformOptions.vectorTransposeLowering == + vector::VectorTransposeLowering::Shuffle && + resType.getRank() == 2 && transp[0] == 1 && transp[1] == 0) + return rewriter.notifyMatchFailure( + op, "Options specifies lowering to shuffle"); + + // Handle a true 2-D matrix transpose differently when requested. + if (vectorTransformOptions.vectorTransposeLowering == + vector::VectorTransposeLowering::Flat && + resType.getRank() == 2 && transp[0] == 1 && transp[1] == 0) { + Type flattenedType = + VectorType::get(resType.getNumElements(), resType.getElementType()); + auto matrix = + rewriter.create(loc, flattenedType, input); + auto rows = rewriter.getI32IntegerAttr(resType.getShape()[0]); + auto columns = rewriter.getI32IntegerAttr(resType.getShape()[1]); + Value trans = rewriter.create( + loc, flattenedType, matrix, rows, columns); + rewriter.replaceOpWithNewOp(op, resType, trans); + return success(); + } + + // Generate unrolled extract/insert ops. 
We do not unroll the rightmost + // (i.e., highest-order) dimensions that are not transposed and leave them + // in vector form to improve performance. Therefore, we prune those + // dimensions from the shape/transpose data structures used to generate the + // extract/insert ops. + SmallVector prunedTransp; + pruneNonTransposedDims(transp, prunedTransp); + size_t numPrunedDims = transp.size() - prunedTransp.size(); + auto prunedInShape = inputType.getShape().drop_back(numPrunedDims); + auto prunedInStrides = computeStrides(prunedInShape); + + // Generates the extract/insert operations for every scalar/vector element + // of the leftmost transposed dimensions. We traverse every transpose + // element using a linearized index that we delinearize to generate the + // appropriate indices for the extract/insert operations. + Value result = rewriter.create( + loc, resType, rewriter.getZeroAttr(resType)); + int64_t numTransposedElements = ShapedType::getNumElements(prunedInShape); + + for (int64_t linearIdx = 0; linearIdx < numTransposedElements; + ++linearIdx) { + auto extractIdxs = delinearize(linearIdx, prunedInStrides); + SmallVector insertIdxs(extractIdxs); + applyPermutationToVector(insertIdxs, prunedTransp); + Value extractOp = + rewriter.create(loc, input, extractIdxs); + result = + rewriter.create(loc, extractOp, result, insertIdxs); + } + + rewriter.replaceOp(op, result); + return success(); + } + +private: + /// Options to control the vector patterns. + vector::VectorTransformsOptions vectorTransformOptions; +}; + +/// Rewrite a 2-D vector.transpose as a sequence of: +/// vector.shape_cast 2D -> 1D +/// vector.shuffle +/// vector.shape_cast 1D -> 2D +class TransposeOp2DToShuffleLowering + : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + TransposeOp2DToShuffleLowering( + vector::VectorTransformsOptions vectorTransformOptions, + MLIRContext *context, PatternBenefit benefit = 1) + : OpRewritePattern(context, benefit), + vectorTransformOptions(vectorTransformOptions) {} + + LogicalResult matchAndRewrite(vector::TransposeOp op, + PatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + VectorType srcType = op.getSourceVectorType(); + if (srcType.getRank() != 2) + return rewriter.notifyMatchFailure(op, "Not a 2D transpose"); + + SmallVector transp; + for (auto attr : op.getTransp()) + transp.push_back(attr.cast().getInt()); + if (transp[0] != 1 && transp[1] != 0) + return rewriter.notifyMatchFailure(op, "Not a 2D transpose permutation"); + + if (vectorTransformOptions.vectorTransposeLowering != + VectorTransposeLowering::Shuffle) + return rewriter.notifyMatchFailure(op, "Options do not ask for Shuffle"); + + int64_t m = srcType.getShape().front(), n = srcType.getShape().back(); + Value casted = rewriter.create( + loc, VectorType::get({m * n}, srcType.getElementType()), + op.getVector()); + SmallVector mask; + mask.reserve(m * n); + for (int64_t j = 0; j < n; ++j) + for (int64_t i = 0; i < m; ++i) + mask.push_back(i * n + j); + + Value shuffled = + rewriter.create(loc, casted, casted, mask); + rewriter.replaceOpWithNewOp( + op, op.getResultVectorType(), shuffled); + + return success(); + } + +private: + /// Options to control the vector patterns. 
+ vector::VectorTransformsOptions vectorTransformOptions; +}; +} // namespace + +void mlir::vector::populateVectorTransposeLoweringPatterns( + RewritePatternSet &patterns, VectorTransformsOptions options, + PatternBenefit benefit) { + patterns.add( + options, patterns.getContext(), benefit); +} diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp index 38062b9893f1a..b0690f63422d9 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp @@ -16,6 +16,7 @@ #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h" #include "mlir/Dialect/Vector/Utils/VectorUtils.h" #include "mlir/IR/BuiltinOps.h" diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp index ee23b5494f707..caf5822256bc6 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#include #include +#include #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" @@ -92,11 +92,11 @@ static Value createInBoundsCond(RewriterBase &b, } /// Split a vector.transfer operation into an in-bounds (i.e., no out-of-bounds -/// masking) fastpath and a slowpath. +/// masking) fast path and a slow path. /// If `ifOp` is not null and the result is `success, the `ifOp` points to the /// newly created conditional upon function return. -/// To accomodate for the fact that the original vector.transfer indexing may be -/// arbitrary and the slow path indexes @[0...0] in the temporary buffer, the +/// To accommodate for the fact that the original vector.transfer indexing may +/// be arbitrary and the slow path indexes @[0...0] in the temporary buffer, the /// scf.if op returns a view and values of type index. /// At this time, only vector.transfer_read case is implemented. /// @@ -107,11 +107,11 @@ static Value createInBoundsCond(RewriterBase &b, /// is transformed into: /// ``` /// %1:3 = scf.if (%inBounds) { -/// // fastpath, direct cast +/// // fast path, direct cast /// memref.cast %A: memref to compatibleMemRefType /// scf.yield %view : compatibleMemRefType, index, index /// } else { -/// // slowpath, not in-bounds vector.transfer or linalg.copy. +/// // slow path, not in-bounds vector.transfer or linalg.copy. /// memref.cast %alloc: memref to compatibleMemRefType /// scf.yield %4 : compatibleMemRefType, index, index // } @@ -172,12 +172,10 @@ static MemRefType getCastCompatibleMemRefType(MemRefType aT, MemRefType bT) { for (int64_t idx = 0, e = aT.getRank(); idx < e; ++idx) { resShape[idx] = (aShape[idx] == bShape[idx]) ? aShape[idx] : ShapedType::kDynamic; - resStrides[idx] = (aStrides[idx] == bStrides[idx]) - ? aStrides[idx] - : ShapedType::kDynamic; + resStrides[idx] = + (aStrides[idx] == bStrides[idx]) ? aStrides[idx] : ShapedType::kDynamic; } - resOffset = - (aOffset == bOffset) ? aOffset : ShapedType::kDynamic; + resOffset = (aOffset == bOffset) ? 
aOffset : ShapedType::kDynamic; return MemRefType::get( resShape, aT.getElementType(), StridedLayoutAttr::get(aT.getContext(), resOffset, resStrides)); @@ -634,7 +632,34 @@ LogicalResult mlir::vector::splitFullAndPartialTransfer( return success(); } -LogicalResult mlir::vector::VectorTransferFullPartialRewriter::matchAndRewrite( +namespace { +/// Apply `splitFullAndPartialTransfer` selectively via a pattern. This pattern +/// may take an extra filter to perform selection at a finer granularity. +struct VectorTransferFullPartialRewriter : public RewritePattern { + using FilterConstraintType = + std::function; + + explicit VectorTransferFullPartialRewriter( + MLIRContext *context, + VectorTransformsOptions options = VectorTransformsOptions(), + FilterConstraintType filter = + [](VectorTransferOpInterface op) { return success(); }, + PatternBenefit benefit = 1) + : RewritePattern(MatchAnyOpTypeTag(), benefit, context), options(options), + filter(std::move(filter)) {} + + /// Performs the rewrite. + LogicalResult matchAndRewrite(Operation *op, + PatternRewriter &rewriter) const override; + +private: + VectorTransformsOptions options; + FilterConstraintType filter; +}; + +} // namespace + +LogicalResult VectorTransferFullPartialRewriter::matchAndRewrite( Operation *op, PatternRewriter &rewriter) const { auto xferOp = dyn_cast(op); if (!xferOp || failed(splitFullAndPartialTransferPrecondition(xferOp)) || @@ -642,3 +667,9 @@ LogicalResult mlir::vector::VectorTransferFullPartialRewriter::matchAndRewrite( return failure(); return splitFullAndPartialTransfer(rewriter, xferOp, options); } + +void mlir::vector::populateVectorTransferFullPartialPatterns( + RewritePatternSet &patterns, const VectorTransformsOptions &options) { + patterns.add(patterns.getContext(), + options); +} diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp index fe59143ebd55f..20fc59e874ab6 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp @@ -51,102 +51,6 @@ using namespace mlir; using namespace mlir::vector; -// Helper to find an index in an affine map. -static std::optional getResultIndex(AffineMap map, int64_t index) { - for (int64_t i = 0, e = map.getNumResults(); i < e; ++i) { - int64_t idx = map.getDimPosition(i); - if (idx == index) - return i; - } - return std::nullopt; -} - -// Helper to construct iterator types with one index removed. -static SmallVector adjustIter(ArrayAttr iteratorTypes, - int64_t index) { - SmallVector results; - for (const auto &it : llvm::enumerate(iteratorTypes)) { - int64_t idx = it.index(); - if (idx == index) - continue; - results.push_back(it.value()); - } - return results; -} - -// Helper to construct an affine map with one index removed. -static AffineMap adjustMap(AffineMap map, int64_t index, - PatternRewriter &rewriter) { - auto *ctx = rewriter.getContext(); - SmallVector results; - for (int64_t i = 0, e = map.getNumResults(); i < e; ++i) { - int64_t idx = map.getDimPosition(i); - if (idx == index) - continue; - // Re-insert remaining indices, but renamed when occurring - // after the removed index. - auto targetExpr = getAffineDimExpr(idx < index ? idx : idx - 1, ctx); - results.push_back(targetExpr); - } - return AffineMap::get(map.getNumDims() - 1, 0, results, ctx); -} - -// Helper method to possibly drop a dimension in a load. 
-// TODO -static Value reshapeLoad(Location loc, Value val, VectorType type, - int64_t index, int64_t pos, - PatternRewriter &rewriter) { - if (index == -1) - return val; - Type lowType = VectorType::Builder(type).dropDim(0); - // At extraction dimension? - if (index == 0) { - auto posAttr = rewriter.getI64ArrayAttr(pos); - return rewriter.create(loc, lowType, val, posAttr); - } - // Unroll leading dimensions. - VectorType vType = lowType.cast(); - Type resType = VectorType::Builder(type).dropDim(index); - auto resVectorType = resType.cast(); - Value result = rewriter.create( - loc, resVectorType, rewriter.getZeroAttr(resVectorType)); - for (int64_t d = 0, e = resVectorType.getDimSize(0); d < e; d++) { - auto posAttr = rewriter.getI64ArrayAttr(d); - Value ext = rewriter.create(loc, vType, val, posAttr); - Value load = reshapeLoad(loc, ext, vType, index - 1, pos, rewriter); - result = rewriter.create(loc, resVectorType, load, result, - posAttr); - } - return result; -} - -// Helper method to possibly drop a dimension in a store. -// TODO -static Value reshapeStore(Location loc, Value val, Value result, - VectorType type, int64_t index, int64_t pos, - PatternRewriter &rewriter) { - // Unmodified? - if (index == -1) - return val; - // At insertion dimension? - if (index == 0) { - auto posAttr = rewriter.getI64ArrayAttr(pos); - return rewriter.create(loc, type, val, result, posAttr); - } - // Unroll leading dimensions. - Type lowType = VectorType::Builder(type).dropDim(0); - VectorType vType = lowType.cast(); - Type insType = VectorType::Builder(vType).dropDim(0); - for (int64_t d = 0, e = type.getDimSize(0); d < e; d++) { - auto posAttr = rewriter.getI64ArrayAttr(d); - Value ext = rewriter.create(loc, vType, result, posAttr); - Value ins = rewriter.create(loc, insType, val, posAttr); - Value sto = reshapeStore(loc, ins, ext, vType, index - 1, pos, rewriter); - result = rewriter.create(loc, type, sto, result, posAttr); - } - return result; -} - template static SmallVector extractVector(ArrayAttr arrayAttr) { return llvm::to_vector<4>(llvm::map_range( @@ -154,61 +58,11 @@ static SmallVector extractVector(ArrayAttr arrayAttr) { [](IntegerAttr attr) { return static_cast(attr.getInt()); })); } -/// Helper to create arithmetic operation associated with a kind of contraction. -static std::optional -createContractArithOp(Location loc, Value x, Value y, Value acc, - vector::CombiningKind kind, PatternRewriter &rewriter, - bool isInt, Value mask = Value()) { - using vector::CombiningKind; - Value mul; - - if (isInt) { - if (kind == CombiningKind::MINF || kind == CombiningKind::MAXF) - // Only valid for floating point types. - return std::nullopt; - mul = rewriter.create(loc, x, y); - } else { - // Float case. - if (kind == CombiningKind::AND || kind == CombiningKind::MINUI || - kind == CombiningKind::MINSI || kind == CombiningKind::MAXUI || - kind == CombiningKind::MAXSI || kind == CombiningKind::OR || - kind == CombiningKind::XOR) - // Only valid for integer types. - return std::nullopt; - // Special case for fused multiply-add. - if (acc && acc.getType().isa() && kind == CombiningKind::ADD) { - Value fma = rewriter.create(loc, x, y, acc); - if (mask) - // The fma op doesn't need explicit masking. However, fma ops used in - // reductions must preserve previous 'acc' values for masked-out lanes. 
- fma = selectPassthru(rewriter, mask, fma, acc); - return fma; - } - mul = rewriter.create(loc, x, y); - } - - if (!acc) - return std::optional(mul); - - return makeArithReduction(rewriter, loc, kind, mul, acc, mask); -} - -/// Return the positions of the reductions in the given map. -static SmallVector getReductionIndex(AffineMap map, - ArrayAttr iteratorTypes) { - SmallVector dimsIdx; - for (unsigned i = 0, e = map.getNumResults(); i < e; i++) { - if (isReductionIterator(iteratorTypes[map.getDimPosition(i)])) - dimsIdx.push_back(i); - } - return dimsIdx; -} - -/// Look for a given dimension in an affine map and return its position. Return -/// std::nullopt if the dimension is not in the map results. -static std::optional getDimPosition(AffineMap map, unsigned dim) { - for (unsigned i = 0, e = map.getNumResults(); i < e; i++) { - if (map.getDimPosition(i) == dim) +// Helper to find an index in an affine map. +static std::optional getResultIndex(AffineMap map, int64_t index) { + for (int64_t i = 0, e = map.getNumResults(); i < e; ++i) { + int64_t idx = map.getDimPosition(i); + if (idx == index) return i; } return std::nullopt; @@ -264,735 +118,6 @@ struct ShapeCastOpFolder : public OpRewritePattern { } }; -/// Progressive lowering of BroadcastOp. -class BroadcastOpLowering : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(vector::BroadcastOp op, - PatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - VectorType dstType = op.getResultVectorType(); - VectorType srcType = op.getSourceType().dyn_cast(); - Type eltType = dstType.getElementType(); - - // Scalar to any vector can use splat. - if (!srcType) { - rewriter.replaceOpWithNewOp(op, dstType, op.getSource()); - return success(); - } - - // Determine rank of source and destination. - int64_t srcRank = srcType.getRank(); - int64_t dstRank = dstType.getRank(); - - // Stretching scalar inside vector (e.g. vector<1xf32>) can use splat. - if (srcRank <= 1 && dstRank == 1) { - Value ext; - if (srcRank == 0) - ext = rewriter.create(loc, op.getSource()); - else - ext = rewriter.create(loc, op.getSource(), 0); - rewriter.replaceOpWithNewOp(op, dstType, ext); - return success(); - } - - // Duplicate this rank. - // For example: - // %x = broadcast %y : k-D to n-D, k < n - // becomes: - // %b = broadcast %y : k-D to (n-1)-D - // %x = [%b,%b,%b,%b] : n-D - // becomes: - // %b = [%y,%y] : (n-1)-D - // %x = [%b,%b,%b,%b] : n-D - if (srcRank < dstRank) { - // Duplication. - VectorType resType = - VectorType::get(dstType.getShape().drop_front(), eltType); - Value bcst = - rewriter.create(loc, resType, op.getSource()); - Value result = rewriter.create( - loc, dstType, rewriter.getZeroAttr(dstType)); - for (int64_t d = 0, dim = dstType.getDimSize(0); d < dim; ++d) - result = rewriter.create(loc, bcst, result, d); - rewriter.replaceOp(op, result); - return success(); - } - - // Find non-matching dimension, if any. - assert(srcRank == dstRank); - int64_t m = -1; - for (int64_t r = 0; r < dstRank; r++) - if (srcType.getDimSize(r) != dstType.getDimSize(r)) { - m = r; - break; - } - - // All trailing dimensions are the same. Simply pass through. - if (m == -1) { - rewriter.replaceOp(op, op.getSource()); - return success(); - } - - // Any non-matching dimension forces a stretch along this rank. 
- // For example: - // %x = broadcast %y : vector<4x1x2xf32> to vector<4x2x2xf32> - // becomes: - // %a = broadcast %y[0] : vector<1x2xf32> to vector<2x2xf32> - // %b = broadcast %y[1] : vector<1x2xf32> to vector<2x2xf32> - // %c = broadcast %y[2] : vector<1x2xf32> to vector<2x2xf32> - // %d = broadcast %y[3] : vector<1x2xf32> to vector<2x2xf32> - // %x = [%a,%b,%c,%d] - // becomes: - // %u = broadcast %y[0][0] : vector<2xf32> to vector <2x2xf32> - // %v = broadcast %y[1][0] : vector<2xf32> to vector <2x2xf32> - // %a = [%u, %v] - // .. - // %x = [%a,%b,%c,%d] - VectorType resType = - VectorType::get(dstType.getShape().drop_front(), eltType); - Value result = rewriter.create( - loc, dstType, rewriter.getZeroAttr(dstType)); - if (m == 0) { - // Stetch at start. - Value ext = rewriter.create(loc, op.getSource(), 0); - Value bcst = rewriter.create(loc, resType, ext); - for (int64_t d = 0, dim = dstType.getDimSize(0); d < dim; ++d) - result = rewriter.create(loc, bcst, result, d); - } else { - // Stetch not at start. - for (int64_t d = 0, dim = dstType.getDimSize(0); d < dim; ++d) { - Value ext = rewriter.create(loc, op.getSource(), d); - Value bcst = rewriter.create(loc, resType, ext); - result = rewriter.create(loc, bcst, result, d); - } - } - rewriter.replaceOp(op, result); - return success(); - } -}; - -/// Given a 'transpose' pattern, prune the rightmost dimensions that are not -/// transposed. -void pruneNonTransposedDims(ArrayRef transpose, - SmallVectorImpl &result) { - size_t numTransposedDims = transpose.size(); - for (size_t transpDim : llvm::reverse(transpose)) { - if (transpDim != numTransposedDims - 1) - break; - numTransposedDims--; - } - - result.append(transpose.begin(), transpose.begin() + numTransposedDims); -} - -/// Progressive lowering of TransposeOp. -/// One: -/// %x = vector.transpose %y, [1, 0] -/// is replaced by: -/// %z = arith.constant dense<0.000000e+00> -/// %0 = vector.extract %y[0, 0] -/// %1 = vector.insert %0, %z [0, 0] -/// .. -/// %x = vector.insert .., .. [.., ..] -class TransposeOpLowering : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - TransposeOpLowering(vector::VectorTransformsOptions vectorTransformOptions, - MLIRContext *context, PatternBenefit benefit = 1) - : OpRewritePattern(context, benefit), - vectorTransformOptions(vectorTransformOptions) {} - - LogicalResult matchAndRewrite(vector::TransposeOp op, - PatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - Value input = op.getVector(); - VectorType inputType = op.getSourceVectorType(); - VectorType resType = op.getResultVectorType(); - - // Set up convenience transposition table. - SmallVector transp; - for (auto attr : op.getTransp()) - transp.push_back(attr.cast().getInt()); - - if (vectorTransformOptions.vectorTransposeLowering == - vector::VectorTransposeLowering::Shuffle && - resType.getRank() == 2 && transp[0] == 1 && transp[1] == 0) - return rewriter.notifyMatchFailure( - op, "Options specifies lowering to shuffle"); - - // Handle a true 2-D matrix transpose differently when requested. 
- if (vectorTransformOptions.vectorTransposeLowering == - vector::VectorTransposeLowering::Flat && - resType.getRank() == 2 && transp[0] == 1 && transp[1] == 0) { - Type flattenedType = - VectorType::get(resType.getNumElements(), resType.getElementType()); - auto matrix = - rewriter.create(loc, flattenedType, input); - auto rows = rewriter.getI32IntegerAttr(resType.getShape()[0]); - auto columns = rewriter.getI32IntegerAttr(resType.getShape()[1]); - Value trans = rewriter.create( - loc, flattenedType, matrix, rows, columns); - rewriter.replaceOpWithNewOp(op, resType, trans); - return success(); - } - - // Generate unrolled extract/insert ops. We do not unroll the rightmost - // (i.e., highest-order) dimensions that are not transposed and leave them - // in vector form to improve performance. Therefore, we prune those - // dimensions from the shape/transpose data structures used to generate the - // extract/insert ops. - SmallVector prunedTransp; - pruneNonTransposedDims(transp, prunedTransp); - size_t numPrunedDims = transp.size() - prunedTransp.size(); - auto prunedInShape = inputType.getShape().drop_back(numPrunedDims); - auto prunedInStrides = computeStrides(prunedInShape); - - // Generates the extract/insert operations for every scalar/vector element - // of the leftmost transposed dimensions. We traverse every transpose - // element using a linearized index that we delinearize to generate the - // appropriate indices for the extract/insert operations. - Value result = rewriter.create( - loc, resType, rewriter.getZeroAttr(resType)); - int64_t numTransposedElements = ShapedType::getNumElements(prunedInShape); - - for (int64_t linearIdx = 0; linearIdx < numTransposedElements; - ++linearIdx) { - auto extractIdxs = delinearize(linearIdx, prunedInStrides); - SmallVector insertIdxs(extractIdxs); - applyPermutationToVector(insertIdxs, prunedTransp); - Value extractOp = - rewriter.create(loc, input, extractIdxs); - result = - rewriter.create(loc, extractOp, result, insertIdxs); - } - - rewriter.replaceOp(op, result); - return success(); - } - -private: - /// Options to control the vector patterns. 
- vector::VectorTransformsOptions vectorTransformOptions; -}; - -/// Rewrite a 2-D vector.transpose as a sequence of: -/// vector.shape_cast 2D -> 1D -/// vector.shuffle -/// vector.shape_cast 1D -> 2D -class TransposeOp2DToShuffleLowering - : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - TransposeOp2DToShuffleLowering( - vector::VectorTransformsOptions vectorTransformOptions, - MLIRContext *context, PatternBenefit benefit = 1) - : OpRewritePattern(context, benefit), - vectorTransformOptions(vectorTransformOptions) {} - - LogicalResult matchAndRewrite(vector::TransposeOp op, - PatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - VectorType srcType = op.getSourceVectorType(); - if (srcType.getRank() != 2) - return rewriter.notifyMatchFailure(op, "Not a 2D transpose"); - - SmallVector transp; - for (auto attr : op.getTransp()) - transp.push_back(attr.cast().getInt()); - if (transp[0] != 1 && transp[1] != 0) - return rewriter.notifyMatchFailure(op, "Not a 2D transpose permutation"); - - if (vectorTransformOptions.vectorTransposeLowering != - VectorTransposeLowering::Shuffle) - return rewriter.notifyMatchFailure(op, "Options do not ask for Shuffle"); - - int64_t m = srcType.getShape().front(), n = srcType.getShape().back(); - Value casted = rewriter.create( - loc, VectorType::get({m * n}, srcType.getElementType()), - op.getVector()); - SmallVector mask; - mask.reserve(m * n); - for (int64_t j = 0; j < n; ++j) - for (int64_t i = 0; i < m; ++i) - mask.push_back(i * n + j); - - Value shuffled = - rewriter.create(loc, casted, casted, mask); - rewriter.replaceOpWithNewOp( - op, op.getResultVectorType(), shuffled); - - return success(); - } - -private: - /// Options to control the vector patterns. - vector::VectorTransformsOptions vectorTransformOptions; -}; - -/// Progressive lowering of OuterProductOp. -/// One: -/// %x = vector.outerproduct %lhs, %rhs, %acc -/// is replaced by: -/// %z = zero-result -/// %0 = vector.extract %lhs[0] -/// %1 = vector.broadcast %0 -/// %2 = vector.extract %acc[0] -/// %3 = vector.fma %1, %rhs, %2 -/// %4 = vector.insert %3, %z[0] -/// .. -/// %x = vector.insert %.., %..[N-1] -/// -class OuterProductOpLowering : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(vector::OuterProductOp op, - PatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - - VectorType lhsType = op.getOperandVectorTypeLHS(); - VectorType rhsType = op.getOperandTypeRHS().dyn_cast(); - VectorType resType = op.getResultVectorType(); - Type eltType = resType.getElementType(); - bool isInt = eltType.isa(); - Value acc = (op.getAcc().empty()) ? nullptr : op.getAcc()[0]; - vector::CombiningKind kind = op.getKind(); - - // Vector mask setup. - OpBuilder::InsertionGuard guard(rewriter); - auto maskableOp = cast(op.getOperation()); - Operation *rootOp; - Value mask; - if (maskableOp.isMasked()) { - rewriter.setInsertionPoint(maskableOp.getMaskingOp()); - rootOp = maskableOp.getMaskingOp(); - mask = maskableOp.getMaskingOp().getMask(); - } else { - rootOp = op; - } - - if (!rhsType) { - // Special case: AXPY operation. 
- Value b = rewriter.create(loc, lhsType, op.getRhs()); - std::optional mult = createContractArithOp( - loc, op.getLhs(), b, acc, kind, rewriter, isInt, mask); - if (!mult.has_value()) - return failure(); - rewriter.replaceOp(rootOp, *mult); - return success(); - } - - Value result = rewriter.create( - loc, resType, rewriter.getZeroAttr(resType)); - for (int64_t d = 0, e = resType.getDimSize(0); d < e; ++d) { - auto pos = rewriter.getI64ArrayAttr(d); - Value x = rewriter.create(loc, op.getLhs(), pos); - Value a = rewriter.create(loc, rhsType, x); - Value r = nullptr; - if (acc) - r = rewriter.create(loc, acc, pos); - Value extrMask; - if (mask) - extrMask = rewriter.create(loc, mask, pos); - - std::optional m = createContractArithOp( - loc, a, op.getRhs(), r, kind, rewriter, isInt, extrMask); - if (!m.has_value()) - return failure(); - result = rewriter.create(loc, resType, *m, result, pos); - } - - rewriter.replaceOp(rootOp, result); - return success(); - } -}; - -/// Lower vector.contract with all size one reduction dimensions to -/// elementwise ops when possible. -struct ContractOpToElementwise - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - using FilterConstraintType = - std::function; - static LogicalResult defaultFilter(vector::ContractionOp op) { - return success(); - } - ContractOpToElementwise( - vector::VectorTransformsOptions vectorTransformOptions, - MLIRContext *context, PatternBenefit benefit = 1, - const FilterConstraintType &constraint = defaultFilter) - : OpRewritePattern(context, benefit), - vectorTransformOptions(vectorTransformOptions), filter(defaultFilter) {} - - LogicalResult matchAndRewrite(vector::ContractionOp contractOp, - PatternRewriter &rewriter) const override { - // TODO: Support vector.mask. - auto maskableOp = cast(contractOp.getOperation()); - if (maskableOp.isMasked()) - return failure(); - - // TODO: Remove native masks from contraction op? - if (!contractOp.getMasks().empty()) - return failure(); - - if (failed(filter(contractOp))) - return failure(); - - if (vectorTransformOptions.vectorContractLowering != - vector::VectorContractLowering::ParallelArith) - return failure(); - - ArrayRef lhsShape = contractOp.getLhsType().getShape(); - ArrayRef rhsShape = contractOp.getRhsType().getShape(); - AffineMap lhsMap = contractOp.getIndexingMapsArray()[0]; - AffineMap rhsMap = contractOp.getIndexingMapsArray()[1]; - SmallVector lhsReductionDims = - getReductionIndex(lhsMap, contractOp.getIteratorTypes()); - SmallVector rhsReductionDims = - getReductionIndex(rhsMap, contractOp.getIteratorTypes()); - // All the reduction dimensions must be a size 1. 
- for (int64_t dim : lhsReductionDims) { - if (lhsShape[dim] != 1) - return failure(); - } - for (int64_t dim : rhsReductionDims) { - if (rhsShape[dim] != 1) - return failure(); - } - AffineMap accMap = contractOp.getIndexingMapsArray()[2]; - unsigned numParallelDims = accMap.getNumResults(); - unsigned numLhsDimToBroadcast = - numParallelDims - (lhsMap.getNumResults() - lhsReductionDims.size()); - unsigned numRhsDimToBroadcast = - numParallelDims - (rhsMap.getNumResults() - rhsReductionDims.size()); - SmallVector lhsDims; - SmallVector lhsTranspose; - SmallVector rhsDims; - SmallVector rhsTranspose; - for (int64_t dim : lhsReductionDims) - lhsTranspose.push_back(numLhsDimToBroadcast + dim); - for (int64_t dim : rhsReductionDims) - rhsTranspose.push_back(numRhsDimToBroadcast + dim); - // Loop through the parallel dimensions to calculate the dimensions to - // broadcast and to permute in order to extract only parallel dimensions. - for (unsigned i = 0; i < numParallelDims; i++) { - std::optional lhsDim = - getDimPosition(lhsMap, accMap.getDimPosition(i)); - if (lhsDim) { - lhsTranspose.push_back(numLhsDimToBroadcast + *lhsDim); - } else { - // If the parallel dimension doesn't exist we will have to broadcast it. - lhsDims.push_back( - contractOp.getResultType().cast().getDimSize(i)); - lhsTranspose.push_back(lhsDims.size() - 1); - } - std::optional rhsDim = - getDimPosition(rhsMap, accMap.getDimPosition(i)); - if (rhsDim) { - rhsTranspose.push_back(numRhsDimToBroadcast + *rhsDim); - } else { - // If the parallel dimension doesn't exist we will have to broadcast it. - rhsDims.push_back( - contractOp.getResultType().cast().getDimSize(i)); - rhsTranspose.push_back(rhsDims.size() - 1); - } - } - Value newLhs = contractOp.getLhs(); - Value newRhs = contractOp.getRhs(); - Location loc = contractOp.getLoc(); - if (!lhsDims.empty()) { - lhsDims.append(lhsShape.begin(), lhsShape.end()); - auto expandedType = - VectorType::get(lhsDims, contractOp.getLhsType().getElementType()); - newLhs = rewriter.create(loc, expandedType, newLhs); - } - if (!rhsDims.empty()) { - rhsDims.append(rhsShape.begin(), rhsShape.end()); - auto expandedType = - VectorType::get(rhsDims, contractOp.getRhsType().getElementType()); - newRhs = rewriter.create(loc, expandedType, newRhs); - } - bool isInt = contractOp.getLhsType().getElementType().isIntOrIndex(); - newLhs = rewriter.create(loc, newLhs, lhsTranspose); - newRhs = rewriter.create(loc, newRhs, rhsTranspose); - SmallVector lhsOffsets(lhsReductionDims.size(), 0); - SmallVector rhsOffsets(rhsReductionDims.size(), 0); - newLhs = rewriter.create( - loc, newLhs, rewriter.getI64ArrayAttr(lhsOffsets)); - newRhs = rewriter.create( - loc, newRhs, rewriter.getI64ArrayAttr(rhsOffsets)); - std::optional result = - createContractArithOp(loc, newLhs, newRhs, contractOp.getAcc(), - contractOp.getKind(), rewriter, isInt); - rewriter.replaceOp(contractOp, {*result}); - return success(); - } - -private: - /// Options to control the vector patterns. - vector::VectorTransformsOptions vectorTransformOptions; - FilterConstraintType filter; -}; - -/// Progressive lowering of ConstantMaskOp. -/// One: -/// %x = vector.constant_mask [a,b] -/// is replaced by: -/// %z = zero-result -/// %l = vector.constant_mask [b] -/// %4 = vector.insert %l, %z[0] -/// .. -/// %x = vector.insert %l, %..[a-1] -/// until a one-dimensional vector is reached. All these operations -/// will be folded at LLVM IR level. 
-class ConstantMaskOpLowering : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(vector::ConstantMaskOp op, - PatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - auto dstType = op.getType(); - auto eltType = dstType.getElementType(); - auto dimSizes = op.getMaskDimSizes(); - int64_t rank = dstType.getRank(); - - if (rank == 0) { - assert(dimSizes.size() == 1 && - "Expected exactly one dim size for a 0-D vector"); - bool value = dimSizes[0].cast().getInt() == 1; - rewriter.replaceOpWithNewOp( - op, dstType, - DenseIntElementsAttr::get( - VectorType::get(ArrayRef{}, rewriter.getI1Type()), - ArrayRef{value})); - return success(); - } - - // Scalable constant masks can only be lowered for the "none set" case. - if (dstType.cast().isScalable()) { - rewriter.replaceOpWithNewOp( - op, DenseElementsAttr::get(dstType, false)); - return success(); - } - - int64_t trueDim = std::min(dstType.getDimSize(0), - dimSizes[0].cast().getInt()); - - if (rank == 1) { - // Express constant 1-D case in explicit vector form: - // [T,..,T,F,..,F]. - SmallVector values(dstType.getDimSize(0)); - for (int64_t d = 0; d < trueDim; d++) - values[d] = true; - rewriter.replaceOpWithNewOp( - op, dstType, rewriter.getBoolVectorAttr(values)); - return success(); - } - - VectorType lowType = - VectorType::get(dstType.getShape().drop_front(), eltType); - SmallVector newDimSizes; - for (int64_t r = 1; r < rank; r++) - newDimSizes.push_back(dimSizes[r].cast().getInt()); - Value trueVal = rewriter.create( - loc, lowType, rewriter.getI64ArrayAttr(newDimSizes)); - Value result = rewriter.create( - loc, dstType, rewriter.getZeroAttr(dstType)); - for (int64_t d = 0; d < trueDim; d++) { - auto pos = rewriter.getI64ArrayAttr(d); - result = - rewriter.create(loc, dstType, trueVal, result, pos); - } - rewriter.replaceOp(op, result); - return success(); - } -}; - -/// Progressive lowering of CreateMaskOp. -/// One: -/// %x = vector.create_mask %a, ... : vector -/// is replaced by: -/// %l = vector.create_mask ... : vector<...> ; one lower rank -/// %0 = arith.cmpi "slt", %ci, %a | -/// %1 = select %0, %l, %zeroes | -/// %r = vector.insert %1, %pr [i] | d-times -/// %x = .... -/// until a one-dimensional vector is reached. 
-class CreateMaskOpLowering : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(vector::CreateMaskOp op, - PatternRewriter &rewriter) const override { - auto dstType = op.getResult().getType().cast(); - int64_t rank = dstType.getRank(); - if (rank <= 1) - return rewriter.notifyMatchFailure( - op, "0-D and 1-D vectors are handled separately"); - - auto loc = op.getLoc(); - auto eltType = dstType.getElementType(); - int64_t dim = dstType.getDimSize(0); - Value idx = op.getOperand(0); - - VectorType lowType = - VectorType::get(dstType.getShape().drop_front(), eltType); - Value trueVal = rewriter.create( - loc, lowType, op.getOperands().drop_front()); - Value falseVal = rewriter.create( - loc, lowType, rewriter.getZeroAttr(lowType)); - Value result = rewriter.create( - loc, dstType, rewriter.getZeroAttr(dstType)); - for (int64_t d = 0; d < dim; d++) { - Value bnd = - rewriter.create(loc, rewriter.getIndexAttr(d)); - Value val = rewriter.create(loc, arith::CmpIPredicate::slt, - bnd, idx); - Value sel = rewriter.create(loc, val, trueVal, falseVal); - auto pos = rewriter.getI64ArrayAttr(d); - result = - rewriter.create(loc, dstType, sel, result, pos); - } - rewriter.replaceOp(op, result); - return success(); - } -}; - -/// ShapeOp 2D -> 1D downcast serves the purpose of flattening 2-D to 1-D -/// vectors progressively on the way to target llvm.matrix intrinsics. -/// This iterates over the most major dimension of the 2-D vector and performs -/// rewrites into: -/// vector.extract from 2-D + vector.insert_strided_slice offset into 1-D -class ShapeCastOp2DDownCastRewritePattern - : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(vector::ShapeCastOp op, - PatternRewriter &rewriter) const override { - auto sourceVectorType = op.getSourceVectorType(); - auto resultVectorType = op.getResultVectorType(); - if (sourceVectorType.getRank() != 2 || resultVectorType.getRank() != 1) - return failure(); - - auto loc = op.getLoc(); - Value desc = rewriter.create( - loc, resultVectorType, rewriter.getZeroAttr(resultVectorType)); - unsigned mostMinorVectorSize = sourceVectorType.getShape()[1]; - for (int64_t i = 0, e = sourceVectorType.getShape().front(); i != e; ++i) { - Value vec = rewriter.create(loc, op.getSource(), i); - desc = rewriter.create( - loc, vec, desc, - /*offsets=*/i * mostMinorVectorSize, /*strides=*/1); - } - rewriter.replaceOp(op, desc); - return success(); - } -}; - -/// ShapeOp 1D -> 2D upcast serves the purpose of unflattening 2-D from 1-D -/// vectors progressively. -/// This iterates over the most major dimension of the 2-D vector and performs -/// rewrites into: -/// vector.extract_strided_slice from 1-D + vector.insert into 2-D -/// Note that 1-D extract_strided_slice are lowered to efficient vector.shuffle. 
-class ShapeCastOp2DUpCastRewritePattern - : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(vector::ShapeCastOp op, - PatternRewriter &rewriter) const override { - auto sourceVectorType = op.getSourceVectorType(); - auto resultVectorType = op.getResultVectorType(); - if (sourceVectorType.getRank() != 1 || resultVectorType.getRank() != 2) - return failure(); - - auto loc = op.getLoc(); - Value desc = rewriter.create( - loc, resultVectorType, rewriter.getZeroAttr(resultVectorType)); - unsigned mostMinorVectorSize = resultVectorType.getShape()[1]; - for (int64_t i = 0, e = resultVectorType.getShape().front(); i != e; ++i) { - Value vec = rewriter.create( - loc, op.getSource(), /*offsets=*/i * mostMinorVectorSize, - /*sizes=*/mostMinorVectorSize, - /*strides=*/1); - desc = rewriter.create(loc, vec, desc, i); - } - rewriter.replaceOp(op, desc); - return success(); - } -}; - -// We typically should not lower general shape cast operations into data -// movement instructions, since the assumption is that these casts are -// optimized away during progressive lowering. For completeness, however, -// we fall back to a reference implementation that moves all elements -// into the right place if we get here. -class ShapeCastOpRewritePattern : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(vector::ShapeCastOp op, - PatternRewriter &rewriter) const override { - Location loc = op.getLoc(); - auto sourceVectorType = op.getSourceVectorType(); - auto resultVectorType = op.getResultVectorType(); - - // Special case 2D/1D lowerings with better implementations. - // TODO: make is ND/1D to allow generic ND->1D->MD. - int64_t srcRank = sourceVectorType.getRank(); - int64_t resRank = resultVectorType.getRank(); - if ((srcRank == 2 && resRank == 1) || (srcRank == 1 && resRank == 2)) - return failure(); - - // Generic ShapeCast lowering path goes all the way down to unrolled scalar - // extract/insert chains. - // TODO: consider evolving the semantics to only allow 1D source or dest and - // drop this potentially very expensive lowering. - // Compute number of elements involved in the reshape. - int64_t numElts = 1; - for (int64_t r = 0; r < srcRank; r++) - numElts *= sourceVectorType.getDimSize(r); - // Replace with data movement operations: - // x[0,0,0] = y[0,0] - // x[0,0,1] = y[0,1] - // x[0,1,0] = y[0,2] - // etc., incrementing the two index vectors "row-major" - // within the source and result shape. - SmallVector srcIdx(srcRank); - SmallVector resIdx(resRank); - Value result = rewriter.create( - loc, resultVectorType, rewriter.getZeroAttr(resultVectorType)); - for (int64_t i = 0; i < numElts; i++) { - if (i != 0) { - incIdx(srcIdx, sourceVectorType, srcRank - 1); - incIdx(resIdx, resultVectorType, resRank - 1); - } - Value e = rewriter.create(loc, op.getSource(), srcIdx); - result = rewriter.create(loc, e, result, resIdx); - } - rewriter.replaceOp(op, result); - return success(); - } - -private: - static void incIdx(SmallVector &idx, VectorType tp, int64_t r) { - assert(0 <= r && r < tp.getRank()); - if (++idx[r] == tp.getDimSize(r)) { - idx[r] = 0; - incIdx(idx, tp, r - 1); - } - } -}; - /// Convert MulIOp/MulFOp + MultiDimReductionOp into ContractionOp. /// Ex: /// ``` @@ -1425,967 +550,6 @@ struct ReorderElementwiseOpsOnTranspose final } }; -} // namespace - -/// Creates an AddIOp if `isInt` is true otherwise create an arith::AddFOp using -/// operands `x` and `y`. 
-static Value createAdd(Location loc, Value x, Value y, bool isInt, - PatternRewriter &rewriter) { - if (isInt) - return rewriter.create(loc, x, y); - return rewriter.create(loc, x, y); -} - -/// Creates a MulIOp if `isInt` is true otherwise create an MulFOp using -/// operands `x and `y`. -static Value createMul(Location loc, Value x, Value y, bool isInt, - PatternRewriter &rewriter) { - if (isInt) - return rewriter.create(loc, x, y); - return rewriter.create(loc, x, y); -} - -namespace mlir { - -/// Progressively lower a `vector.contract %a, %b, %c` with row-major matmul -/// semantics to: -/// ``` -/// %mta = maybe_transpose -/// %mtb = maybe_transpose -/// %flattened_a = vector.shape_cast %mta -/// %flattened_b = vector.shape_cast %mtb -/// %flattened_d = vector.matmul %flattened_a, %flattened_b -/// %mtd = vector.shape_cast %flattened_d -/// %d = maybe_untranspose %mtd -/// %e = add %c, %d -/// ``` -/// `vector.matmul` later lowers to `llvm.matrix.multiply`. -// -/// This only kicks in when VectorTransformsOptions is set to `Matmul`. -/// vector.transpose operations are inserted if the vector.contract op is not a -/// row-major matrix multiply. -LogicalResult -ContractionOpToMatmulOpLowering::matchAndRewrite(vector::ContractionOp op, - PatternRewriter &rew) const { - // TODO: Support vector.mask. - auto maskableOp = cast(op.getOperation()); - if (maskableOp.isMasked()) - return failure(); - - // TODO: Remove native masks from contraction op? - if (!op.getMasks().empty()) - return failure(); - if (vectorTransformOptions.vectorContractLowering != - vector::VectorContractLowering::Matmul) - return failure(); - if (failed(filter(op))) - return failure(); - - auto iteratorTypes = op.getIteratorTypes().getValue(); - if (!isParallelIterator(iteratorTypes[0]) || - !isParallelIterator(iteratorTypes[1]) || - !isReductionIterator(iteratorTypes[2])) - return failure(); - - Type elementType = op.getLhsType().getElementType(); - if (!elementType.isIntOrFloat()) - return failure(); - - Type dstElementType = op.getType(); - if (auto vecType = dstElementType.dyn_cast()) - dstElementType = vecType.getElementType(); - if (elementType != dstElementType) - return failure(); - - // Perform lhs + rhs transpositions to conform to matmul row-major semantics. - // Bail out if the contraction cannot be put in this form. - MLIRContext *ctx = op.getContext(); - Location loc = op.getLoc(); - AffineExpr m, n, k; - bindDims(rew.getContext(), m, n, k); - // LHS must be A(m, k) or A(k, m). - Value lhs = op.getLhs(); - auto lhsMap = op.getIndexingMapsArray()[0]; - if (lhsMap == AffineMap::get(3, 0, {k, m}, ctx)) - lhs = rew.create(loc, lhs, ArrayRef{1, 0}); - else if (lhsMap != AffineMap::get(3, 0, {m, k}, ctx)) - return failure(); - - // RHS must be B(k, n) or B(n, k). - Value rhs = op.getRhs(); - auto rhsMap = op.getIndexingMapsArray()[1]; - if (rhsMap == AffineMap::get(3, 0, {n, k}, ctx)) - rhs = rew.create(loc, rhs, ArrayRef{1, 0}); - else if (rhsMap != AffineMap::get(3, 0, {k, n}, ctx)) - return failure(); - - // At this point lhs and rhs are in row-major. 
- VectorType lhsType = lhs.getType().cast(); - VectorType rhsType = rhs.getType().cast(); - int64_t lhsRows = lhsType.getDimSize(0); - int64_t lhsColumns = lhsType.getDimSize(1); - int64_t rhsColumns = rhsType.getDimSize(1); - - Type flattenedLHSType = - VectorType::get(lhsType.getNumElements(), lhsType.getElementType()); - lhs = rew.create(loc, flattenedLHSType, lhs); - - Type flattenedRHSType = - VectorType::get(rhsType.getNumElements(), rhsType.getElementType()); - rhs = rew.create(loc, flattenedRHSType, rhs); - - Value mul = rew.create(loc, lhs, rhs, lhsRows, lhsColumns, - rhsColumns); - mul = rew.create( - loc, - VectorType::get({lhsRows, rhsColumns}, - getElementTypeOrSelf(op.getAcc().getType())), - mul); - - // ACC must be C(m, n) or C(n, m). - auto accMap = op.getIndexingMapsArray()[2]; - if (accMap == AffineMap::get(3, 0, {n, m}, ctx)) - mul = rew.create(loc, mul, ArrayRef{1, 0}); - else if (accMap != AffineMap::get(3, 0, {m, n}, ctx)) - llvm_unreachable("invalid contraction semantics"); - - Value res = - elementType.isa() - ? static_cast(rew.create(loc, op.getAcc(), mul)) - : static_cast( - rew.create(loc, op.getAcc(), mul)); - - rew.replaceOp(op, res); - return success(); -} - -namespace { - -/// Generate a vector implementation for matmat, matvec and tmatvec. -/// This unrolls outer-products along the reduction dimension. -struct UnrolledOuterProductGenerator - : public StructuredGenerator { - UnrolledOuterProductGenerator(RewriterBase &b, vector::ContractionOp op) - : StructuredGenerator(b, op), - kind(op.getKind()), lhs(op.getLhs()), rhs(op.getRhs()), - res(op.getAcc()), lhsType(op.getLhsType()) { - auto maskableOp = cast(op.getOperation()); - if (maskableOp.isMasked()) - mask = maskableOp.getMaskingOp().getMask(); - } - - Value t(Value v, ArrayRef perm = {1, 0}) { - if (!v) - return v; - return rewriter.create(loc, v, perm); - } - - Value promote(Value v, Type dstElementType) { - Type elementType = v.getType(); - auto vecType = elementType.dyn_cast(); - if (vecType) - elementType = vecType.getElementType(); - if (elementType == dstElementType) - return v; - Type promotedType = dstElementType; - if (vecType) - promotedType = VectorType::get(vecType.getShape(), promotedType); - if (dstElementType.isa()) - return rewriter.create(loc, promotedType, v); - return rewriter.create(loc, promotedType, v); - } - - FailureOr outerProd(Value lhs, Value rhs, Value res, int reductionSize, - std::optional maybeMask = std::nullopt) { - assert(reductionSize > 0); - // Incremental support for masking. - if (mask && !maybeMask.has_value()) - return failure(); - - Type resElementType = res.getType().cast().getElementType(); - for (int64_t k = 0; k < reductionSize; ++k) { - Value extractA = rewriter.create(loc, lhs, k); - Value extractB = rewriter.create(loc, rhs, k); - extractA = promote(extractA, resElementType); - extractB = promote(extractB, resElementType); - Value extractMask; - if (maybeMask.has_value() && maybeMask.value()) - extractMask = - rewriter.create(loc, maybeMask.value(), k); - - Operation *outerProdOp = rewriter.create( - loc, res.getType(), extractA, extractB, res, kind); - res = maskOperation(rewriter, outerProdOp, extractMask)->getResult(0); - } - return res; - } - - /// Two outer parallel, one inner reduction (matmat flavor). - FailureOr matmat() { - if (!iters({Par(), Par(), Red()})) - return failure(); - // Set up the parallel/reduction structure in the right form. 
- AffineExpr m, n, k; - bindDims(rewriter.getContext(), m, n, k); - // Classical row-major matmul: Just permute the lhs. - if (layout({{m, k}, {k, n}, {m, n}})) - return outerProd(t(lhs), rhs, res, lhsType.getDimSize(1), - t(mask, {2, 0, 1})); - // TODO: may be better to fail and use some vector -> scalar reduction. - if (layout({{m, k}, {n, k}, {m, n}})) { - Value tlhs = t(lhs); - return outerProd(tlhs, t(rhs), res, lhsType.getDimSize(1)); - } - // No need to permute anything. - if (layout({{k, m}, {k, n}, {m, n}})) - return outerProd(lhs, rhs, res, lhsType.getDimSize(0)); - // Just permute the rhs. - if (layout({{k, m}, {n, k}, {m, n}})) - return outerProd(lhs, t(rhs), res, lhsType.getDimSize(0)); - // Transposed output: swap RHS and LHS. - // Classical row-major matmul: permute the lhs. - if (layout({{m, k}, {k, n}, {n, m}})) - return outerProd(rhs, t(lhs), res, lhsType.getDimSize(1)); - // TODO: may be better to fail and use some vector -> scalar reduction. - if (layout({{m, k}, {n, k}, {n, m}})) { - Value trhs = t(rhs); - return outerProd(trhs, t(lhs), res, lhsType.getDimSize(1)); - } - if (layout({{k, m}, {k, n}, {n, m}})) - return outerProd(rhs, lhs, res, lhsType.getDimSize(0)); - if (layout({{k, m}, {n, k}, {n, m}})) - return outerProd(t(rhs), lhs, res, lhsType.getDimSize(0)); - return failure(); - } - - /// One outer parallel, one inner reduction (matvec flavor) - FailureOr matvec() { - if (!iters({Par(), Red()})) - return failure(); - AffineExpr m, k; - bindDims(rewriter.getContext(), m, k); - - // Case mat-vec: transpose. - if (layout({{m, k}, {k}, {m}})) - return outerProd(t(lhs), rhs, res, lhsType.getDimSize(1), t(mask)); - // Case mat-trans-vec: ready to go. - if (layout({{k, m}, {k}, {m}})) - return outerProd(lhs, rhs, res, lhsType.getDimSize(0)); - // Case vec-mat: swap and transpose. - if (layout({{k}, {m, k}, {m}})) - return outerProd(t(rhs), lhs, res, lhsType.getDimSize(0)); - // Case vec-mat-trans: swap and ready to go. - if (layout({{k}, {k, m}, {m}})) - return outerProd(rhs, lhs, res, lhsType.getDimSize(0)); - return failure(); - } - - // - // One outer reduction, one inner parallel (tmatvec flavor) - // - FailureOr tmatvec() { - if (!iters({Red(), Par()})) - return failure(); - AffineExpr k, m; - bindDims(rewriter.getContext(), k, m); - - // Case mat-vec: transpose. - if (layout({{m, k}, {k}, {m}})) - return outerProd(t(lhs), rhs, res, lhsType.getDimSize(1)); - // Case mat-trans-vec: ready to go. - if (layout({{k, m}, {k}, {m}})) - return outerProd(lhs, rhs, res, lhsType.getDimSize(0)); - // Case vec-mat: swap and transpose. - if (layout({{k}, {m, k}, {m}})) - return outerProd(t(rhs), lhs, res, lhsType.getDimSize(0)); - // Case vec-mat-trans: swap and ready to go. - if (layout({{k}, {k, m}, {m}})) - return outerProd(rhs, lhs, res, lhsType.getDimSize(0)); - return failure(); - } - -private: - vector::CombiningKind kind; - Value lhs, rhs, res, mask; - VectorType lhsType; -}; -} // namespace - -/// Progressively lower a `vector.contract %a, %b, %c` with row-major matmul -/// semantics to a reduction_size-unrolled sequence: -/// ``` -/// %at = vector.transpose %a, [1, 0] -/// %bRow0 = vector.extract %b[0] -/// %atRow0 = vector.extract %at[0] -/// %c0 = vector.outerproduct %atRow0, %bRow0, %c -/// ... 
-/// %bRowK = vector.extract %b[K] -/// %atRowK = vector.extract %at[K] -/// %cK = vector.outerproduct %atRowK, %bRowK, %cK-1 -/// ``` -/// -/// This only kicks in when VectorTransformsOptions is set to OuterProduct but -/// otherwise supports any layout permutation of the matrix-multiply. -LogicalResult ContractionOpToOuterProductOpLowering::matchAndRewrite( - vector::ContractionOp op, PatternRewriter &rewriter) const { - // TODO: Remove native masks from contraction op? - if (!op.getMasks().empty()) - return failure(); - - if (vectorTransformOptions.vectorContractLowering != - vector::VectorContractLowering::OuterProduct) - return failure(); - - if (failed(filter(op))) - return failure(); - - // Vector mask setup. - OpBuilder::InsertionGuard guard(rewriter); - auto maskableOp = cast(op.getOperation()); - Operation *rootOp; - if (maskableOp.isMasked()) { - rewriter.setInsertionPoint(maskableOp.getMaskingOp()); - rootOp = maskableOp.getMaskingOp(); - } else { - rootOp = op; - } - - UnrolledOuterProductGenerator e(rewriter, op); - FailureOr matmatRes = e.matmat(); - if (succeeded(matmatRes)) { - rewriter.replaceOp(rootOp, *matmatRes); - return success(); - } - FailureOr matvecRes = e.matvec(); - if (succeeded(matvecRes)) { - rewriter.replaceOp(rootOp, *matvecRes); - return success(); - } - FailureOr tmatvecRes = e.tmatvec(); - if (succeeded(tmatvecRes)) { - rewriter.replaceOp(rootOp, *tmatvecRes); - return success(); - } - - return failure(); -} - -LogicalResult -ContractionOpToDotLowering::matchAndRewrite(vector::ContractionOp op, - PatternRewriter &rewriter) const { - // TODO: Support vector.mask. - auto maskableOp = cast(op.getOperation()); - if (maskableOp.isMasked()) - return failure(); - - // TODO: Remove native masks from contraction op? - if (!op.getMasks().empty()) - return failure(); - - if (failed(filter(op))) - return failure(); - - if (vectorTransformOptions.vectorContractLowering != - vector::VectorContractLowering::Dot) - return failure(); - - auto iteratorTypes = op.getIteratorTypes().getValue(); - static constexpr std::array perm = {1, 0}; - Location loc = op.getLoc(); - Value lhs = op.getLhs(), rhs = op.getRhs(); - - using MapList = ArrayRef>; - auto infer = [](MapList m) { return AffineMap::inferFromExprList(m); }; - AffineExpr m, n, k; - bindDims(rewriter.getContext(), m, n, k); - SmallVector maps = op.getIndexingMapsArray(); - // - // In the following we wish to make the reduction dimension innermost so we - // can load vectors and just fmul + reduce into a scalar. - // - if (isParallelIterator(iteratorTypes[0]) && - isParallelIterator(iteratorTypes[1]) && - isReductionIterator(iteratorTypes[2])) { - // - // Two outer parallel, one inner reduction (matmat flavor). - // - if (maps == infer({{m, k}, {k, n}, {m, n}})) { - rhs = rewriter.create(loc, rhs, perm); - } else if (maps == infer({{m, k}, {n, k}, {m, n}})) { - // No need to permute anything. - } else if (maps == infer({{k, m}, {k, n}, {m, n}})) { - lhs = rewriter.create(loc, lhs, perm); - rhs = rewriter.create(loc, rhs, perm); - } else if (maps == infer({{k, m}, {n, k}, {m, n}})) { - lhs = rewriter.create(loc, lhs, perm); - } else if (maps == infer({{m, k}, {k, n}, {n, m}})) { - // This is the classical row-major matmul. Just permute the lhs. 
- Value tmp = lhs; - lhs = rewriter.create(loc, rhs, perm); - rhs = tmp; - } else if (maps == infer({{m, k}, {n, k}, {n, m}})) { - std::swap(lhs, rhs); - } else if (maps == infer({{k, m}, {k, n}, {n, m}})) { - Value tmp = lhs; - lhs = rewriter.create(loc, rhs, perm); - rhs = rewriter.create(loc, tmp, perm); - } else if (maps == infer({{k, m}, {n, k}, {n, m}})) { - Value tmp = rhs; - rhs = rewriter.create(loc, lhs, perm); - lhs = tmp; - } else { - return failure(); - } - } else if (isParallelIterator(iteratorTypes[0]) && - isReductionIterator(iteratorTypes[1])) { - // - // One outer parallel, one inner reduction (matvec flavor) - // - if (maps == infer({{m, n}, {n}, {m}})) { - // No need to permute anything. - } else if (maps == infer({{n, m}, {n}, {m}})) { - lhs = rewriter.create(loc, lhs, perm); - } else if (maps == infer({{n}, {m, n}, {m}})) { - std::swap(lhs, rhs); - } else if (maps == infer({{n}, {n, m}, {m}})) { - std::swap(lhs, rhs); - lhs = rewriter.create(loc, lhs, perm); - } else { - return failure(); - } - } else { - return failure(); - } - - VectorType dstType = op.getResultType().cast(); - assert(dstType.getRank() >= 1 && dstType.getRank() <= 2 && - "Expected dst type of rank 1 or 2"); - - unsigned rank = dstType.getRank(); - unsigned dstRows = dstType.getShape()[0]; - unsigned dstColumns = rank == 1 ? 1 : dstType.getShape()[1]; - - // ExtractOp does not allow dynamic indexing, we must unroll explicitly. - Value res = rewriter.create(loc, dstType, - rewriter.getZeroAttr(dstType)); - bool isInt = dstType.getElementType().isa(); - for (unsigned r = 0; r < dstRows; ++r) { - Value a = rewriter.create(op.getLoc(), lhs, r); - for (unsigned c = 0; c < dstColumns; ++c) { - Value b = rank == 1 - ? rhs - : rewriter.create(op.getLoc(), rhs, c); - Value m = createMul(op.getLoc(), a, b, isInt, rewriter); - Value reduced = rewriter.create( - op.getLoc(), vector::CombiningKind::ADD, m); - - SmallVector pos = rank == 1 ? SmallVector{r} - : SmallVector{r, c}; - res = rewriter.create(op.getLoc(), reduced, res, pos); - } - } - if (auto acc = op.getAcc()) - res = createAdd(op.getLoc(), res, acc, isInt, rewriter); - rewriter.replaceOp(op, res); - return success(); -} - -/// Progressive lowering of ContractionOp. -/// One: -/// %x = vector.contract with at least one free/batch dimension -/// is replaced by: -/// %a = vector.contract with one less free/batch dimension -/// %b = vector.contract with one less free/batch dimension -/// .. -/// %x = combine %a %b .. -/// until a pure contraction is reached (no free/batch dimensions), -/// which is replaced by a dot-product. -/// -/// This only kicks in when either VectorTransformsOptions is set -/// to DOT or when other contraction patterns fail. -// -// TODO: break down into transpose/reshape/cast ops -// when they become available to avoid code dup -// TODO: investigate lowering order impact on performance -LogicalResult -ContractionOpLowering::matchAndRewrite(vector::ContractionOp op, - PatternRewriter &rewriter) const { - // TODO: Remove native masks from contraction op? - if (!op.getMasks().empty()) - return failure(); - - if (failed(filter(op))) - return failure(); - - // TODO: support mixed mode contract lowering. - if (op.getLhsType().getElementType() != - getElementTypeOrSelf(op.getAccType()) || - op.getRhsType().getElementType() != getElementTypeOrSelf(op.getAccType())) - return failure(); - - // TODO: the code below assumes the default contraction, make sure it supports - // other kinds before enabling this lowering. 
- if (op.getKind() != vector::CombiningKind::ADD) { - return rewriter.notifyMatchFailure( - op, "contractions other than 'add' not supported"); - } - - // TODO: implement benefits, cost models. - MLIRContext *ctx = op.getContext(); - ContractionOpToMatmulOpLowering pat1(vectorTransformOptions, ctx); - if (succeeded(pat1.matchAndRewrite(op, rewriter))) - return success(); - ContractionOpToOuterProductOpLowering pat2(vectorTransformOptions, ctx); - if (succeeded(pat2.matchAndRewrite(op, rewriter))) - return success(); - ContractionOpToDotLowering pat3(vectorTransformOptions, ctx); - if (succeeded(pat3.matchAndRewrite(op, rewriter))) - return success(); - ContractOpToElementwise pat4(vectorTransformOptions, ctx); - if (succeeded(pat4.matchAndRewrite(op, rewriter))) - return success(); - - // Vector mask setup. - OpBuilder::InsertionGuard guard(rewriter); - Operation *rootOp = op; - Value mask; - if (op.isMasked()) { - rewriter.setInsertionPoint(op.getMaskingOp()); - rootOp = op.getMaskingOp(); - mask = op.getMaskingOp().getMask(); - } - - // Find first batch dimension in LHS/RHS, and lower when found. - std::vector> batchDimMap = op.getBatchDimMap(); - if (!batchDimMap.empty()) { - int64_t lhsIndex = batchDimMap[0].first; - int64_t rhsIndex = batchDimMap[0].second; - auto newOp = lowerParallel(rewriter, op, lhsIndex, rhsIndex, mask); - if (failed(newOp)) - return failure(); - rewriter.replaceOp(rootOp, *newOp); - return success(); - } - - // Collect contracting dimensions. - std::vector> contractingDimMap = - op.getContractingDimMap(); - DenseSet lhsContractingDimSet; - DenseSet rhsContractingDimSet; - for (auto &dimPair : contractingDimMap) { - lhsContractingDimSet.insert(dimPair.first); - rhsContractingDimSet.insert(dimPair.second); - } - - // Find first free dimension in LHS, and lower when found. - VectorType lhsType = op.getLhsType(); - for (int64_t lhsIndex = 0, e = lhsType.getRank(); lhsIndex < e; ++lhsIndex) { - if (lhsContractingDimSet.count(lhsIndex) == 0) { - auto newOp = lowerParallel(rewriter, op, lhsIndex, /*rhsIndex=*/-1, mask); - if (failed(newOp)) - return failure(); - rewriter.replaceOp(rootOp, *newOp); - return success(); - } - } - - // Find first free dimension in RHS, and lower when found. - VectorType rhsType = op.getRhsType(); - for (int64_t rhsIndex = 0, e = rhsType.getRank(); rhsIndex < e; ++rhsIndex) { - if (rhsContractingDimSet.count(rhsIndex) == 0) { - auto newOp = lowerParallel(rewriter, op, /*lhsIndex=*/-1, rhsIndex, mask); - if (failed(newOp)) - return failure(); - rewriter.replaceOp(rootOp, *newOp); - return success(); - } - } - - // Lower the first remaining reduction dimension. - if (!contractingDimMap.empty()) { - auto newOp = lowerReduction(rewriter, op, mask); - if (failed(newOp)) - return failure(); - rewriter.replaceOp(rootOp, *newOp); - return success(); - } - - return failure(); -} - -// Lower one parallel dimension. -// Incidentally also tolerates unit-size (hence trivial) reduction dimensions. -// TODO: consider reusing existing contract unrolling -FailureOr ContractionOpLowering::lowerParallel(PatternRewriter &rewriter, - vector::ContractionOp op, - int64_t lhsIndex, - int64_t rhsIndex, - Value mask) const { - VectorType lhsType = op.getLhsType(); - VectorType rhsType = op.getRhsType(); - VectorType resType = op.getResultType().cast(); - // Find the iterator type index and result index. 
- SmallVector iMap = op.getIndexingMapsArray(); - int64_t iterIndex = -1; - int64_t dimSize = -1; - if (lhsIndex >= 0) { - iterIndex = iMap[0].getDimPosition(lhsIndex); - if (rhsIndex >= 0 && iterIndex != iMap[1].getDimPosition(rhsIndex)) - return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) { - diag << "expected lhsIndex=" << lhsIndex << " and rhsIndex=" << rhsIndex - << " to map to the same dimension"; - }); - dimSize = lhsType.getDimSize(lhsIndex); - } else if (rhsIndex >= 0) { - iterIndex = iMap[1].getDimPosition(rhsIndex); - dimSize = rhsType.getDimSize(rhsIndex); - } - if (iterIndex < 0) - return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) { - diag << "expected either lhsIndex=" << lhsIndex - << " or rhsIndex=" << rhsIndex << " to be nonnegative"; - }); - // value_or(-1) means that we tolerate a dimension not appearing - // in the result map. That can't happen for actual parallel iterators, but - // the caller ContractionOpLowering::matchAndRewrite is currently calling - // lowerParallel also for the case of unit-size reduction dims appearing only - // on one of LHS or RHS, not both. At the moment, such cases are created by - // CastAwayContractionLeadingOneDim, so we need to either support that or - // modify that pattern. - int64_t resIndex = getResultIndex(iMap[2], iterIndex).value_or(-1); - if (resIndex == -1 && dimSize != 1) - return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) { - diag << "expected the dimension for iterIndex=" << iterIndex - << " to either appear in the result map, or to be a unit dimension"; - }); - - // Construct new iterator types and affine map array attribute. - std::array lowIndexingMaps = { - adjustMap(iMap[0], iterIndex, rewriter), - adjustMap(iMap[1], iterIndex, rewriter), - adjustMap(iMap[2], iterIndex, rewriter)}; - auto lowAffine = rewriter.getAffineMapArrayAttr(lowIndexingMaps); - auto lowIter = - rewriter.getArrayAttr(adjustIter(op.getIteratorTypes(), iterIndex)); - // Unroll into a series of lower dimensional vector.contract ops. - Location loc = op.getLoc(); - Value result = rewriter.create( - loc, resType, rewriter.getZeroAttr(resType)); - - for (int64_t d = 0; d < dimSize; ++d) { - auto lhs = reshapeLoad(loc, op.getLhs(), lhsType, lhsIndex, d, rewriter); - auto rhs = reshapeLoad(loc, op.getRhs(), rhsType, rhsIndex, d, rewriter); - auto acc = reshapeLoad(loc, op.getAcc(), resType, resIndex, d, rewriter); - - Value lowMask; - if (mask) - lowMask = reshapeLoad(loc, mask, cast(mask.getType()), - iterIndex, d, rewriter); - - Operation *lowContract = rewriter.create( - loc, lhs, rhs, acc, lowAffine, lowIter); - lowContract = maskOperation(rewriter, lowContract, lowMask); - result = reshapeStore(loc, lowContract->getResult(0), result, resType, - resIndex, d, rewriter); - } - return result; -} - -// Lower one reduction dimension. -FailureOr ContractionOpLowering::lowerReduction( - PatternRewriter &rewriter, vector::ContractionOp op, Value mask) const { - auto loc = op.getLoc(); - VectorType lhsType = op.getLhsType(); - VectorType rhsType = op.getRhsType(); - Type resType = op.getResultType(); - if (resType.isa()) - return rewriter.notifyMatchFailure(op, - "did not expect a VectorType result"); - bool isInt = resType.isa(); - // Use iterator index 0. 
- int64_t iterIndex = 0; - SmallVector iMap = op.getIndexingMapsArray(); - std::optional lookupLhs = getResultIndex(iMap[0], iterIndex); - std::optional lookupRhs = getResultIndex(iMap[1], iterIndex); - if (!lookupLhs.has_value()) - return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) { - diag << "expected iterIndex=" << iterIndex << "to map to a LHS dimension"; - }); - if (!lookupRhs.has_value()) - return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) { - diag << "expected iterIndex=" << iterIndex << "to map to a RHS dimension"; - }); - int64_t lhsIndex = *lookupLhs; - int64_t rhsIndex = *lookupRhs; - int64_t dimSize = lhsType.getDimSize(lhsIndex); - if (dimSize != rhsType.getDimSize(rhsIndex)) - return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) { - diag << "expect LHS dimension " << lhsIndex - << " to have the same size as RHS dimension " << rhsIndex; - }); - // Base case. - if (lhsType.getRank() == 1) { - if (rhsType.getRank() != 1) - return rewriter.notifyMatchFailure( - op, "When LHS has rank 1, expected also RHS to have rank 1"); - Value m = createMul(loc, op.getLhs(), op.getRhs(), isInt, rewriter); - auto kind = vector::CombiningKind::ADD; - - Value acc = op.getAcc(); - Operation *reductionOp = - acc ? rewriter.create(loc, kind, m, acc) - : rewriter.create(loc, kind, m); - return maskOperation(rewriter, reductionOp, mask)->getResult(0); - } - // Construct new iterator types and affine map array attribute. - std::array lowIndexingMaps = { - adjustMap(iMap[0], iterIndex, rewriter), - adjustMap(iMap[1], iterIndex, rewriter), - adjustMap(iMap[2], iterIndex, rewriter)}; - auto lowAffine = rewriter.getAffineMapArrayAttr(lowIndexingMaps); - auto lowIter = - rewriter.getArrayAttr(adjustIter(op.getIteratorTypes(), iterIndex)); - // Unroll into a series of lower dimensional vector.contract ops. - // By feeding the initial accumulator into the first contraction, - // and the result of each contraction into the next, eventually - // the sum of all reductions is computed. - Value result = op.getAcc(); - for (int64_t d = 0; d < dimSize; ++d) { - auto lhs = reshapeLoad(loc, op.getLhs(), lhsType, lhsIndex, d, rewriter); - auto rhs = reshapeLoad(loc, op.getRhs(), rhsType, rhsIndex, d, rewriter); - Value newMask; - if (mask) - newMask = reshapeLoad(loc, mask, cast(mask.getType()), - iterIndex, d, rewriter); - - Operation *newContract = rewriter.create( - loc, lhs, rhs, result, lowAffine, lowIter); - result = maskOperation(rewriter, newContract, newMask)->getResult(0); - } - return result; -} - -} // namespace mlir - -/// Progressive lowering of transfer_read. This pattern supports lowering of -/// `vector.transfer_read` to a combination of `vector.load` and -/// `vector.broadcast` if all of the following hold: -/// - Stride of most minor memref dimension must be 1. -/// - Out-of-bounds masking is not required. -/// - If the memref's element type is a vector type then it coincides with the -/// result type. -/// - The permutation map doesn't perform permutation (broadcasting is allowed). 
-struct TransferReadToVectorLoadLowering - : public OpRewritePattern { - TransferReadToVectorLoadLowering(MLIRContext *context, - std::optional maxRank, - PatternBenefit benefit = 1) - : OpRewritePattern(context, benefit), - maxTransferRank(maxRank) {} - - LogicalResult matchAndRewrite(vector::TransferReadOp read, - PatternRewriter &rewriter) const override { - if (maxTransferRank && read.getVectorType().getRank() > *maxTransferRank) - return failure(); - - SmallVector broadcastedDims; - // Permutations are handled by VectorToSCF or - // populateVectorTransferPermutationMapLoweringPatterns. - // We let the 0-d corner case pass-through as it is supported. - if (!read.getPermutationMap().isMinorIdentityWithBroadcasting( - &broadcastedDims)) - return failure(); - - auto memRefType = read.getShapedType().dyn_cast(); - if (!memRefType) - return failure(); - - // Non-unit strides are handled by VectorToSCF. - if (!vector::isLastMemrefDimUnitStride(memRefType)) - return failure(); - - // If there is broadcasting involved then we first load the unbroadcasted - // vector, and then broadcast it with `vector.broadcast`. - ArrayRef vectorShape = read.getVectorType().getShape(); - SmallVector unbroadcastedVectorShape(vectorShape.begin(), - vectorShape.end()); - for (unsigned i : broadcastedDims) - unbroadcastedVectorShape[i] = 1; - VectorType unbroadcastedVectorType = VectorType::get( - unbroadcastedVectorShape, read.getVectorType().getElementType()); - - // `vector.load` supports vector types as memref's elements only when the - // resulting vector type is the same as the element type. - auto memrefElTy = memRefType.getElementType(); - if (memrefElTy.isa() && memrefElTy != unbroadcastedVectorType) - return failure(); - - // Otherwise, element types of the memref and the vector must match. - if (!memrefElTy.isa() && - memrefElTy != read.getVectorType().getElementType()) - return failure(); - - // Out-of-bounds dims are handled by MaterializeTransferMask. - if (read.hasOutOfBoundsDim()) - return failure(); - - // Create vector load op. - Operation *loadOp; - if (read.getMask()) { - Value fill = rewriter.create( - read.getLoc(), unbroadcastedVectorType, read.getPadding()); - loadOp = rewriter.create( - read.getLoc(), unbroadcastedVectorType, read.getSource(), - read.getIndices(), read.getMask(), fill); - } else { - loadOp = rewriter.create( - read.getLoc(), unbroadcastedVectorType, read.getSource(), - read.getIndices()); - } - - // Insert a broadcasting op if required. - if (!broadcastedDims.empty()) { - rewriter.replaceOpWithNewOp( - read, read.getVectorType(), loadOp->getResult(0)); - } else { - rewriter.replaceOp(read, loadOp->getResult(0)); - } - - return success(); - } - - std::optional maxTransferRank; -}; - -/// Replace a 0-d vector.load with a memref.load + vector.broadcast. -// TODO: we shouldn't cross the vector/scalar domains just for this -// but atm we lack the infra to avoid it. Possible solutions include: -// - go directly to LLVM + bitcast -// - introduce a bitcast op and likely a new pointer dialect -// - let memref.load/store additionally support the 0-d vector case -// There are still deeper data layout issues lingering even in this -// trivial case (for architectures for which this matters). 
-struct VectorLoadToMemrefLoadLowering - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(vector::LoadOp loadOp, - PatternRewriter &rewriter) const override { - auto vecType = loadOp.getVectorType(); - if (vecType.getNumElements() != 1) - return failure(); - auto memrefLoad = rewriter.create( - loadOp.getLoc(), loadOp.getBase(), loadOp.getIndices()); - rewriter.replaceOpWithNewOp(loadOp, vecType, - memrefLoad); - return success(); - } -}; - -/// Replace a 0-d vector.store with a vector.extractelement + memref.store. -struct VectorStoreToMemrefStoreLowering - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(vector::StoreOp storeOp, - PatternRewriter &rewriter) const override { - auto vecType = storeOp.getVectorType(); - if (vecType.getNumElements() != 1) - return failure(); - Value extracted; - if (vecType.getRank() == 0) { - // TODO: Unifiy once ExtractOp supports 0-d vectors. - extracted = rewriter.create( - storeOp.getLoc(), storeOp.getValueToStore()); - } else { - SmallVector indices(vecType.getRank(), 0); - extracted = rewriter.create( - storeOp.getLoc(), storeOp.getValueToStore(), indices); - } - - rewriter.replaceOpWithNewOp( - storeOp, extracted, storeOp.getBase(), storeOp.getIndices()); - return success(); - } -}; - -/// Progressive lowering of transfer_write. This pattern supports lowering of -/// `vector.transfer_write` to `vector.store` if all of the following hold: -/// - Stride of most minor memref dimension must be 1. -/// - Out-of-bounds masking is not required. -/// - If the memref's element type is a vector type then it coincides with the -/// type of the written value. -/// - The permutation map is the minor identity map (neither permutation nor -/// broadcasting is allowed). -struct TransferWriteToVectorStoreLowering - : public OpRewritePattern { - TransferWriteToVectorStoreLowering(MLIRContext *context, - std::optional maxRank, - PatternBenefit benefit = 1) - : OpRewritePattern(context, benefit), - maxTransferRank(maxRank) {} - - LogicalResult matchAndRewrite(vector::TransferWriteOp write, - PatternRewriter &rewriter) const override { - if (maxTransferRank && write.getVectorType().getRank() > *maxTransferRank) - return rewriter.notifyMatchFailure(write.getLoc(), [=](Diagnostic &diag) { - diag << "rank exceeds maxTransferRank: " << write; - }); - - // Permutations are handled by VectorToSCF or - // populateVectorTransferPermutationMapLoweringPatterns. - if ( // pass-through for the 0-d corner case. - !write.getPermutationMap().isMinorIdentity()) - return rewriter.notifyMatchFailure(write.getLoc(), [=](Diagnostic &diag) { - diag << "permutation map is not minor identity: " << write; - }); - - auto memRefType = write.getShapedType().dyn_cast(); - if (!memRefType) - return rewriter.notifyMatchFailure(write.getLoc(), [=](Diagnostic &diag) { - diag << "not a memref type: " << write; - }); - - // Non-unit strides are handled by VectorToSCF. - if (!vector::isLastMemrefDimUnitStride(memRefType)) - return rewriter.notifyMatchFailure(write.getLoc(), [=](Diagnostic &diag) { - diag << "most minor stride is not 1: " << write; - }); - - // `vector.store` supports vector types as memref's elements only when the - // type of the vector value being written is the same as the element type. 
- auto memrefElTy = memRefType.getElementType(); - if (memrefElTy.isa() && memrefElTy != write.getVectorType()) - return rewriter.notifyMatchFailure(write.getLoc(), [=](Diagnostic &diag) { - diag << "elemental type mismatch: " << write; - }); - - // Otherwise, element types of the memref and the vector must match. - if (!memrefElTy.isa() && - memrefElTy != write.getVectorType().getElementType()) - return rewriter.notifyMatchFailure(write.getLoc(), [=](Diagnostic &diag) { - diag << "elemental type mismatch: " << write; - }); - - // Out-of-bounds dims are handled by MaterializeTransferMask. - if (write.hasOutOfBoundsDim()) - return rewriter.notifyMatchFailure(write.getLoc(), [=](Diagnostic &diag) { - diag << "out of bounds dim: " << write; - }); - if (write.getMask()) { - rewriter.replaceOpWithNewOp( - write, write.getSource(), write.getIndices(), write.getMask(), - write.getVector()); - } else { - rewriter.replaceOpWithNewOp( - write, write.getVector(), write.getSource(), write.getIndices()); - } - return success(); - } - - std::optional maxTransferRank; -}; - // Returns the values in `arrayAttr` as an integer vector. static SmallVector getIntValueVector(ArrayAttr arrayAttr) { return llvm::to_vector<4>( @@ -2863,202 +1027,6 @@ class DropInnerMostUnitDims : public OpRewritePattern { } }; -namespace { - -/// This function checks to see if the vector combining kind -/// is consistent with the integer or float element type. -static bool isValidKind(bool isInt, vector::CombiningKind kind) { - using vector::CombiningKind; - enum class KindType { FLOAT, INT, INVALID }; - KindType type{KindType::INVALID}; - switch (kind) { - case CombiningKind::MINF: - case CombiningKind::MAXF: - type = KindType::FLOAT; - break; - case CombiningKind::MINUI: - case CombiningKind::MINSI: - case CombiningKind::MAXUI: - case CombiningKind::MAXSI: - case CombiningKind::AND: - case CombiningKind::OR: - case CombiningKind::XOR: - type = KindType::INT; - break; - case CombiningKind::ADD: - case CombiningKind::MUL: - type = isInt ? KindType::INT : KindType::FLOAT; - break; - } - bool isValidIntKind = (type == KindType::INT) && isInt; - bool isValidFloatKind = (type == KindType::FLOAT) && (!isInt); - return (isValidIntKind || isValidFloatKind); -} - -/// This function constructs the appropriate integer or float -/// operation given the vector combining kind and operands. The -/// supported int operations are : add, mul, min (signed/unsigned), -/// max(signed/unsigned), and, or, xor. The supported float -/// operations are : add, mul, min and max. 
-static Value genOperator(Location loc, Value x, Value y, - vector::CombiningKind kind, - PatternRewriter &rewriter) { - using vector::CombiningKind; - - auto elType = x.getType().cast().getElementType(); - bool isInt = elType.isIntOrIndex(); - - Value combinedResult{nullptr}; - switch (kind) { - case CombiningKind::ADD: - if (isInt) - combinedResult = rewriter.create(loc, x, y); - else - combinedResult = rewriter.create(loc, x, y); - break; - case CombiningKind::MUL: - if (isInt) - combinedResult = rewriter.create(loc, x, y); - else - combinedResult = rewriter.create(loc, x, y); - break; - case CombiningKind::MINUI: - combinedResult = rewriter.create(loc, x, y); - break; - case CombiningKind::MINSI: - combinedResult = rewriter.create(loc, x, y); - break; - case CombiningKind::MAXUI: - combinedResult = rewriter.create(loc, x, y); - break; - case CombiningKind::MAXSI: - combinedResult = rewriter.create(loc, x, y); - break; - case CombiningKind::AND: - combinedResult = rewriter.create(loc, x, y); - break; - case CombiningKind::OR: - combinedResult = rewriter.create(loc, x, y); - break; - case CombiningKind::XOR: - combinedResult = rewriter.create(loc, x, y); - break; - case CombiningKind::MINF: - combinedResult = rewriter.create(loc, x, y); - break; - case CombiningKind::MAXF: - combinedResult = rewriter.create(loc, x, y); - break; - } - return combinedResult; -} - -/// Convert vector.scan op into arith ops and -/// vector.insert_strided_slice/extract_strided_slice -/// -/// Ex: -/// ``` -/// %0:2 = vector.scan , %arg0, %arg1 {inclusive = true, reduction_dim = -/// 1} : -/// (vector<2x3xi32>, vector<2xi32>) to (vector<2x3xi32>, vector<2xi32>) -/// ``` -/// Gets converted to: -/// ``` -/// %cst = arith.constant dense<0> : vector<2x3xi32> -/// %0 = vector.extract_strided_slice %arg0 {offsets = [0, 0], sizes = [2, 1], -/// strides = [1, 1]} : vector<2x3xi32> to vector<2x1xi32> %1 = -/// vector.insert_strided_slice %0, %cst {offsets = [0, 0], strides = [1, 1]} -/// : vector<2x1xi32> into vector<2x3xi32> %2 = vector.extract_strided_slice -/// %arg0 {offsets = [0, 1], sizes = [2, 1], strides = [1, 1]} : -/// vector<2x3xi32> to vector<2x1xi32> %3 = arith.muli %0, %2 : -/// vector<2x1xi32> %4 = vector.insert_strided_slice %3, %1 {offsets = [0, 1], -/// strides = [1, 1]} : vector<2x1xi32> into vector<2x3xi32> %5 = -/// vector.extract_strided_slice %arg0 {offsets = [0, 2], sizes = [2, 1], -/// strides = [1, 1]} : vector<2x3xi32> to vector<2x1xi32> %6 = arith.muli %3, -/// %5 : vector<2x1xi32> %7 = vector.insert_strided_slice %6, %4 {offsets = -/// [0, 2], strides = [1, 1]} : vector<2x1xi32> into vector<2x3xi32> %8 = -/// vector.shape_cast %6 : vector<2x1xi32> to vector<2xi32> return %7, %8 : -/// vector<2x3xi32>, vector<2xi32> -/// ``` -struct ScanToArithOps : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(vector::ScanOp scanOp, - PatternRewriter &rewriter) const override { - auto loc = scanOp.getLoc(); - VectorType destType = scanOp.getDestType(); - ArrayRef destShape = destType.getShape(); - auto elType = destType.getElementType(); - bool isInt = elType.isIntOrIndex(); - if (!isValidKind(isInt, scanOp.getKind())) - return failure(); - - VectorType resType = VectorType::get(destShape, elType); - Value result = rewriter.create( - loc, resType, rewriter.getZeroAttr(resType)); - int64_t reductionDim = scanOp.getReductionDim(); - bool inclusive = scanOp.getInclusive(); - int64_t destRank = destType.getRank(); - VectorType initialValueType = 
scanOp.getInitialValueType(); - int64_t initialValueRank = initialValueType.getRank(); - - SmallVector reductionShape(destShape.begin(), destShape.end()); - reductionShape[reductionDim] = 1; - VectorType reductionType = VectorType::get(reductionShape, elType); - SmallVector offsets(destRank, 0); - SmallVector strides(destRank, 1); - SmallVector sizes(destShape.begin(), destShape.end()); - sizes[reductionDim] = 1; - ArrayAttr scanSizes = rewriter.getI64ArrayAttr(sizes); - ArrayAttr scanStrides = rewriter.getI64ArrayAttr(strides); - - Value lastOutput, lastInput; - for (int i = 0; i < destShape[reductionDim]; i++) { - offsets[reductionDim] = i; - ArrayAttr scanOffsets = rewriter.getI64ArrayAttr(offsets); - Value input = rewriter.create( - loc, reductionType, scanOp.getSource(), scanOffsets, scanSizes, - scanStrides); - Value output; - if (i == 0) { - if (inclusive) { - output = input; - } else { - if (initialValueRank == 0) { - // ShapeCastOp cannot handle 0-D vectors - output = rewriter.create( - loc, input.getType(), scanOp.getInitialValue()); - } else { - output = rewriter.create( - loc, input.getType(), scanOp.getInitialValue()); - } - } - } else { - Value y = inclusive ? input : lastInput; - output = genOperator(loc, lastOutput, y, scanOp.getKind(), rewriter); - assert(output != nullptr); - } - result = rewriter.create( - loc, output, result, offsets, strides); - lastOutput = output; - lastInput = input; - } - - Value reduction; - if (initialValueRank == 0) { - Value v = rewriter.create(loc, lastOutput, 0); - reduction = - rewriter.create(loc, initialValueType, v); - } else { - reduction = rewriter.create(loc, initialValueType, - lastOutput); - } - - rewriter.replaceOp(scanOp, {result, reduction}); - return success(); - } -}; - /// Canonicalization of a `vector.contraction %a, %b, %c` with row-major matmul /// semantics to a contraction suitable for MMT (matrix matrix multiplication /// with the RHS transposed) lowering. @@ -3157,132 +1125,6 @@ struct CanonicalizeContractMatmulToMMT final FilterConstraintType filter; }; -/// Flattens 2 or more dimensional `vector.gather` ops by unrolling the -/// outermost dimension. For example: -/// ``` -/// %g = vector.gather %base[%c0][%v], %mask, %pass_thru : -/// ... into vector<2x3xf32> -/// -/// ==> -/// -/// %0 = arith.constant dense<0.0> : vector<2x3xf32> -/// %g0 = vector.gather %base[%c0][%v0], %mask0, %pass_thru0 : ... -/// %1 = vector.insert %g0, %0 [0] : vector<3xf32> into vector<2x3xf32> -/// %g1 = vector.gather %base[%c0][%v1], %mask1, %pass_thru1 : ... -/// %g = vector.insert %g1, %1 [1] : vector<3xf32> into vector<2x3xf32> -/// ``` -/// -/// When applied exhaustively, this will produce a sequence of 1-d gather ops. 
-struct FlattenGather : OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(vector::GatherOp op, - PatternRewriter &rewriter) const override { - VectorType resultTy = op.getType(); - if (resultTy.getRank() < 2) - return rewriter.notifyMatchFailure(op, "already flat"); - - Location loc = op.getLoc(); - Value indexVec = op.getIndexVec(); - Value maskVec = op.getMask(); - Value passThruVec = op.getPassThru(); - - Value result = rewriter.create( - loc, resultTy, rewriter.getZeroAttr(resultTy)); - - Type subTy = VectorType::get(resultTy.getShape().drop_front(), - resultTy.getElementType()); - - for (int64_t i = 0, e = resultTy.getShape().front(); i < e; ++i) { - int64_t thisIdx[1] = {i}; - - Value indexSubVec = - rewriter.create(loc, indexVec, thisIdx); - Value maskSubVec = - rewriter.create(loc, maskVec, thisIdx); - Value passThruSubVec = - rewriter.create(loc, passThruVec, thisIdx); - Value subGather = rewriter.create( - loc, subTy, op.getBase(), op.getIndices(), indexSubVec, maskSubVec, - passThruSubVec); - result = - rewriter.create(loc, subGather, result, thisIdx); - } - - rewriter.replaceOp(op, result); - return success(); - } -}; - -/// Turns 1-d `vector.gather` into a scalarized sequence of `vector.loads` or -/// `tensor.extract`s. To avoid out-of-bounds memory accesses, these -/// loads/extracts are made conditional using `scf.if` ops. -struct Gather1DToConditionalLoads : OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(vector::GatherOp op, - PatternRewriter &rewriter) const override { - VectorType resultTy = op.getType(); - if (resultTy.getRank() != 1) - return rewriter.notifyMatchFailure(op, "unsupported rank"); - - Location loc = op.getLoc(); - Type elemTy = resultTy.getElementType(); - // Vector type with a single element. Used to generate `vector.loads`. - VectorType elemVecTy = VectorType::get({1}, elemTy); - - Value condMask = op.getMask(); - Value base = op.getBase(); - Value indexVec = rewriter.createOrFold( - loc, op.getIndexVectorType().clone(rewriter.getIndexType()), - op.getIndexVec()); - auto baseOffsets = llvm::to_vector(op.getIndices()); - Value lastBaseOffset = baseOffsets.back(); - - Value result = op.getPassThru(); - - // Emit a conditional access for each vector element. - for (int64_t i = 0, e = resultTy.getNumElements(); i < e; ++i) { - int64_t thisIdx[1] = {i}; - Value condition = - rewriter.create(loc, condMask, thisIdx); - Value index = rewriter.create(loc, indexVec, thisIdx); - baseOffsets.back() = - rewriter.createOrFold(loc, lastBaseOffset, index); - - auto loadBuilder = [&](OpBuilder &b, Location loc) { - Value extracted; - if (isa(base.getType())) { - // `vector.load` does not support scalar result; emit a vector load - // and extract the single result instead. 
- Value load = - b.create(loc, elemVecTy, base, baseOffsets); - int64_t zeroIdx[1] = {0}; - extracted = b.create(loc, load, zeroIdx); - } else { - extracted = b.create(loc, base, baseOffsets); - } - - Value newResult = - b.create(loc, extracted, result, thisIdx); - b.create(loc, newResult); - }; - auto passThruBuilder = [result](OpBuilder &b, Location loc) { - b.create(loc, result); - }; - - result = - rewriter - .create(loc, condition, /*thenBuilder=*/loadBuilder, - /*elseBuilder=*/passThruBuilder) - .getResult(0); - } - - rewriter.replaceOp(op, result); - return success(); - } -}; - } // namespace void mlir::vector::populateVectorMaskMaterializationPatterns( @@ -3307,33 +1149,6 @@ void mlir::vector::populateBubbleVectorBitCastOpPatterns( benefit); } -void mlir::vector::populateVectorBroadcastLoweringPatterns( - RewritePatternSet &patterns, PatternBenefit benefit) { - patterns.add(patterns.getContext(), benefit); -} - -void mlir::vector::populateVectorMaskOpLoweringPatterns( - RewritePatternSet &patterns, PatternBenefit benefit) { - patterns.add( - patterns.getContext(), benefit); -} - -void mlir::vector::populateVectorShapeCastLoweringPatterns( - RewritePatternSet &patterns, PatternBenefit benefit) { - patterns.add( - patterns.getContext(), benefit); -} - -void mlir::vector::populateVectorContractLoweringPatterns( - RewritePatternSet &patterns, VectorTransformsOptions options, - PatternBenefit benefit) { - patterns.add(patterns.getContext(), benefit); - patterns.add( - options, patterns.getContext(), benefit); -} - void mlir::vector::populateVectorContractCanonicalizeMatmulToMMT( RewritePatternSet &patterns, std::function constraint, @@ -3342,13 +1157,6 @@ void mlir::vector::populateVectorContractCanonicalizeMatmulToMMT( std::move(constraint)); } -void mlir::vector::populateVectorTransposeLoweringPatterns( - RewritePatternSet &patterns, VectorTransformsOptions options, - PatternBenefit benefit) { - patterns.add( - options, patterns.getContext(), benefit); -} - void mlir::vector::populateVectorReductionToContractPatterns( RewritePatternSet &patterns, PatternBenefit benefit) { patterns.add(patterns.getContext(), benefit); } -void mlir::vector::populateVectorTransferLoweringPatterns( - RewritePatternSet &patterns, std::optional maxTransferRank, - PatternBenefit benefit) { - patterns.add(patterns.getContext(), - maxTransferRank, benefit); - patterns - .add( - patterns.getContext(), benefit); -} - -void mlir::vector::populateVectorScanLoweringPatterns( - RewritePatternSet &patterns, PatternBenefit benefit) { - patterns.add(patterns.getContext(), benefit); -} - -void mlir::vector::populateVectorGatherLoweringPatterns( - RewritePatternSet &patterns, PatternBenefit benefit) { - patterns.add(patterns.getContext(), - benefit); -} - //===----------------------------------------------------------------------===// // TableGen'd enum attribute definitions //===----------------------------------------------------------------------===// diff --git a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp index f79ca2259fa38..7a4f9cf5e5101 100644 --- a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp +++ b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp @@ -22,6 +22,7 @@ #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" #include "mlir/Dialect/Vector/Transforms/VectorDistribution.h" #include 
"mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h" #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h" @@ -148,8 +149,9 @@ struct TestVectorContractionLowering if (lowerToOuterProduct) { VectorContractLowering lowering = VectorContractLowering::OuterProduct; VectorTransformsOptions options{lowering}; - patterns.add(options, - &getContext()); + populateVectorContractLoweringPatterns( + patterns, options, /*benefit=*/1, + /*disableOuterProductlowering=*/true); (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)); return; } @@ -469,7 +471,7 @@ struct TestVectorTransferFullPartialSplitPatterns options.setVectorTransferSplit(VectorTransferSplit::LinalgCopy); else options.setVectorTransferSplit(VectorTransferSplit::VectorTransfer); - patterns.add(ctx, options); + populateVectorTransferFullPartialPatterns(patterns, options); (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)); } }; diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 8538c3db59dcd..f565030d63d9f 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -8539,6 +8539,7 @@ cc_library( ":TransformDialect", ":TransformDialectUtils", ":TransformUtils", + ":VectorTransforms", "//llvm:Support", ], ) From a86cc8341de91c48ff724aa07766bc0dbefaa248 Mon Sep 17 00:00:00 2001 From: Alex Brachet Date: Thu, 23 Mar 2023 18:43:09 +0000 Subject: [PATCH 164/208] [libc] Move fma and fmaf into generic dir Differential Revision: https://reviews.llvm.org/D146740 --- libc/src/math/CMakeLists.txt | 27 +++------------------------ libc/src/math/generic/CMakeLists.txt | 24 ++++++++++++++++++++++++ libc/src/math/{ => generic}/fma.cpp | 0 libc/src/math/{ => generic}/fmaf.cpp | 0 4 files changed, 27 insertions(+), 24 deletions(-) rename libc/src/math/{ => generic}/fma.cpp (100%) rename libc/src/math/{ => generic}/fmaf.cpp (100%) diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index 78bab469c28db..bc9a5d7a237f4 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -40,30 +40,6 @@ function(add_math_entrypoint_object name) ) endfunction() -add_entrypoint_object( - fmaf - SRCS - fmaf.cpp - HDRS - fmaf.h - DEPENDS - libc.src.__support.FPUtil.fma - COMPILE_OPTIONS - -O3 -) - -add_entrypoint_object( - fma - SRCS - fma.cpp - HDRS - fma.h - DEPENDS - libc.src.__support.FPUtil.fma - COMPILE_OPTIONS - -O3 -) - add_math_entrypoint_object(acosf) add_math_entrypoint_object(acoshf) @@ -107,6 +83,9 @@ add_math_entrypoint_object(floor) add_math_entrypoint_object(floorf) add_math_entrypoint_object(floorl) +add_math_entrypoint_object(fma) +add_math_entrypoint_object(fmaf) + add_math_entrypoint_object(fmax) add_math_entrypoint_object(fmaxf) add_math_entrypoint_object(fmaxl) diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index 09aefc67d1b51..9fe0fce1c8a67 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -1491,3 +1491,27 @@ add_entrypoint_object( COMPILE_OPTIONS -O3 ) + +add_entrypoint_object( + fmaf + SRCS + fmaf.cpp + HDRS + ../fmaf.h + DEPENDS + libc.src.__support.FPUtil.fma + COMPILE_OPTIONS + -O3 +) + +add_entrypoint_object( + fma + SRCS + fma.cpp + HDRS + ../fma.h + DEPENDS + libc.src.__support.FPUtil.fma + COMPILE_OPTIONS + -O3 +) diff --git a/libc/src/math/fma.cpp b/libc/src/math/generic/fma.cpp similarity index 100% rename from libc/src/math/fma.cpp 
rename to libc/src/math/generic/fma.cpp diff --git a/libc/src/math/fmaf.cpp b/libc/src/math/generic/fmaf.cpp similarity index 100% rename from libc/src/math/fmaf.cpp rename to libc/src/math/generic/fmaf.cpp From 30e89166d765cbe676fdc85bc653df9a3c09ce48 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 23 Mar 2023 19:36:29 +0000 Subject: [PATCH 165/208] [X86] combineVectorSizedSetCCEquality - update arguments to use individual SETCC operands. NFC. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a87dc476a1849..b9ccb5b2c48dc 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -53970,7 +53970,7 @@ static bool isOrXorXorTree(SDValue X, bool Root = true) { /// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp /// expansion. template -static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG, +static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG, EVT VecVT, EVT CmpVT, bool HasPT, F SToV) { SDValue Op0 = X.getOperand(0); SDValue Op1 = X.getOperand(1); @@ -53997,14 +53997,14 @@ static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG, /// Try to map a 128-bit or larger integer comparison to vector instructions /// before type legalization splits it up into chunks. -static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, +static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, + ISD::CondCode CC, + const SDLoc &DL, + SelectionDAG &DAG, const X86Subtarget &Subtarget) { - ISD::CondCode CC = cast(SetCC->getOperand(2))->get(); assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate"); // We're looking for an oversized integer equality comparison. - SDValue X = SetCC->getOperand(0); - SDValue Y = SetCC->getOperand(1); EVT OpVT = X.getValueType(); unsigned OpSize = OpVT.getSizeInBits(); if (!OpVT.isScalarInteger() || OpSize < 128) @@ -54029,9 +54029,6 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, !IsOrXorXorTreeCCZero) return SDValue(); - EVT VT = SetCC->getValueType(0); - SDLoc DL(SetCC); - // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands. // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands. // Otherwise use PCMPEQ (plus AND) and mask testing. @@ -54173,7 +54170,8 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); if (CC == ISD::SETNE || CC == ISD::SETEQ) { - if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget)) + if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG, + Subtarget)) return V; if (VT == MVT::i1 && isNullConstant(RHS)) { From e0361396c2281a108a36d186161ace1843925431 Mon Sep 17 00:00:00 2001 From: Chia-hung Duan Date: Thu, 23 Mar 2023 19:38:48 +0000 Subject: [PATCH 166/208] [scudo] Add a Timer class to assist performance measurement Add Timer and TimingManager which provide convenient way to meause the execution time of code snippets. The output looks like, ``` -- Average Operation Time -- -- Name (# of Calls) -- 1747.2(ns) popBatch (59) 92.3(ns) popBatchImpl (73) 101.6(ns) EmptyBatchProcess (5) 2587.0(ns) pushBlocksImpl (13) ``` Note that `EmptyBatchProcess` is nested under the timer `popBatchImpl`. 
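A minimal usage sketch of the new Timer/TimingManager interfaces (the enclosing function and the timer names are illustrative only; they are taken from the sample output above, not from real allocator code):

```
#include "timing.h"

// Illustrative only: measure a hot path with a ScopedTimer bound to a
// TimingManager. In Scudo the manager would live inside the allocator; a
// local instance is used here just to show the calls.
void exampleHotPath() {
  scudo::TimingManager Manager;
  {
    // Starts timing on construction; stops and reports back to Manager when
    // it goes out of scope.
    scudo::ScopedTimer Outer(Manager, "popBatch");
    // Nested timer: its average is printed indented under "popBatch".
    scudo::ScopedTimer Inner(Manager, Outer, "popBatchImpl");
  }
  // Dumps the "-- Average Operation Time --" table shown above.
  Manager.printAll();
}
```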
Reviewed By: cferris Differential Revision: https://reviews.llvm.org/D143626 --- .../lib/scudo/standalone/CMakeLists.txt | 2 + .../lib/scudo/standalone/tests/CMakeLists.txt | 1 + .../scudo/standalone/tests/timing_test.cpp | 86 +++++++ compiler-rt/lib/scudo/standalone/timing.cpp | 29 +++ compiler-rt/lib/scudo/standalone/timing.h | 215 ++++++++++++++++++ 5 files changed, 333 insertions(+) create mode 100644 compiler-rt/lib/scudo/standalone/tests/timing_test.cpp create mode 100644 compiler-rt/lib/scudo/standalone/timing.cpp create mode 100644 compiler-rt/lib/scudo/standalone/timing.h diff --git a/compiler-rt/lib/scudo/standalone/CMakeLists.txt b/compiler-rt/lib/scudo/standalone/CMakeLists.txt index eefcffd4cfc56..6fcd4deddf716 100644 --- a/compiler-rt/lib/scudo/standalone/CMakeLists.txt +++ b/compiler-rt/lib/scudo/standalone/CMakeLists.txt @@ -85,6 +85,7 @@ set(SCUDO_HEADERS stack_depot.h stats.h string_utils.h + timing.h tsd_exclusive.h tsd_shared.h tsd.h @@ -107,6 +108,7 @@ set(SCUDO_SOURCES report.cpp rss_limit_checker.cpp string_utils.cpp + timing.cpp ) # Enable the necessary instruction set for scudo_crc32.cpp, if available. diff --git a/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt b/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt index 50468d9c6ddc3..335e4b7dbd899 100644 --- a/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt +++ b/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt @@ -105,6 +105,7 @@ set(SCUDO_UNIT_TEST_SOURCES size_class_map_test.cpp stats_test.cpp strings_test.cpp + timing_test.cpp tsd_test.cpp vector_test.cpp scudo_unit_test_main.cpp diff --git a/compiler-rt/lib/scudo/standalone/tests/timing_test.cpp b/compiler-rt/lib/scudo/standalone/tests/timing_test.cpp new file mode 100644 index 0000000000000..09a6c31224673 --- /dev/null +++ b/compiler-rt/lib/scudo/standalone/tests/timing_test.cpp @@ -0,0 +1,86 @@ +//===-- timing_test.cpp -----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "tests/scudo_unit_test.h" + +#include "timing.h" + +#include + +class ScudoTimingTest : public Test { +public: + void testFunc1() { scudo::ScopedTimer ST(Manager, __func__); } + + void testFunc2() { + scudo::ScopedTimer ST(Manager, __func__); + testFunc1(); + } + + void testChainedCalls() { + scudo::ScopedTimer ST(Manager, __func__); + testFunc2(); + } + + void testIgnoredTimer() { + scudo::ScopedTimer ST(Manager, __func__); + ST.ignore(); + } + + void printAllTimersStats() { Manager.printAll(); } + + scudo::TimingManager &getTimingManager() { return Manager; } + +private: + scudo::TimingManager Manager; +}; + +// Given that the output of statistics of timers are dumped through +// `scudo::Printf` which is platform dependent, so we don't have a reliable way +// to catch the output and verify the details. Now we only verify the number of +// invocations on linux. 
+TEST_F(ScudoTimingTest, SimpleTimer) { +#if SCUDO_LINUX + testing::internal::LogToStderr(); + testing::internal::CaptureStderr(); +#endif + + testIgnoredTimer(); + testChainedCalls(); + printAllTimersStats(); + +#if SCUDO_LINUX + std::string output = testing::internal::GetCapturedStderr(); + EXPECT_TRUE(output.find("testIgnoredTimer (1)") == std::string::npos); + EXPECT_TRUE(output.find("testChainedCalls (1)") != std::string::npos); + EXPECT_TRUE(output.find("testFunc2 (1)") != std::string::npos); + EXPECT_TRUE(output.find("testFunc1 (1)") != std::string::npos); +#endif +} + +TEST_F(ScudoTimingTest, NestedTimer) { +#if SCUDO_LINUX + testing::internal::LogToStderr(); + testing::internal::CaptureStderr(); +#endif + + { + scudo::ScopedTimer Outer(getTimingManager(), "Outer"); + { + scudo::ScopedTimer Inner1(getTimingManager(), Outer, "Inner1"); + { scudo::ScopedTimer Inner2(getTimingManager(), Inner1, "Inner2"); } + } + } + printAllTimersStats(); + +#if SCUDO_LINUX + std::string output = testing::internal::GetCapturedStderr(); + EXPECT_TRUE(output.find("Outer (1)") != std::string::npos); + EXPECT_TRUE(output.find("Inner1 (1)") != std::string::npos); + EXPECT_TRUE(output.find("Inner2 (1)") != std::string::npos); +#endif +} diff --git a/compiler-rt/lib/scudo/standalone/timing.cpp b/compiler-rt/lib/scudo/standalone/timing.cpp new file mode 100644 index 0000000000000..59ae21d10f0f6 --- /dev/null +++ b/compiler-rt/lib/scudo/standalone/timing.cpp @@ -0,0 +1,29 @@ +//===-- timing.cpp ----------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "timing.h" + +namespace scudo { + +Timer::~Timer() { + if (Manager) + Manager->report(*this); +} + +ScopedTimer::ScopedTimer(TimingManager &Manager, const char *Name) + : Timer(Manager.getOrCreateTimer(Name)) { + start(); +} + +ScopedTimer::ScopedTimer(TimingManager &Manager, const Timer &Nest, + const char *Name) + : Timer(Manager.nest(Nest, Name)) { + start(); +} + +} // namespace scudo diff --git a/compiler-rt/lib/scudo/standalone/timing.h b/compiler-rt/lib/scudo/standalone/timing.h new file mode 100644 index 0000000000000..155111f9f8e52 --- /dev/null +++ b/compiler-rt/lib/scudo/standalone/timing.h @@ -0,0 +1,215 @@ +//===-- timing.h ------------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "common.h" +#include "mutex.h" +#include "string_utils.h" +#include "thread_annotations.h" + +#include + +namespace scudo { + +class TimingManager; + +// A simple timer for evaluating execution time of code snippets. It can be used +// along with TimingManager or standalone. +class Timer { +public: + // The use of Timer without binding to a TimingManager is supposed to do the + // timer logging manually. Otherwise, TimingManager will do the logging stuff + // for you. 
+ Timer() = default; + Timer(Timer &&Other) + : StartTime(0), AccTime(Other.AccTime), Manager(Other.Manager), + HandleId(Other.HandleId) { + Other.Manager = nullptr; + } + + Timer(const Timer &) = delete; + + virtual ~Timer(); + + void start() { + CHECK_EQ(StartTime, 0U); + StartTime = getMonotonicTime(); + } + void stop() { + AccTime += getMonotonicTime() - StartTime; + StartTime = 0; + } + u64 getAccumulatedTime() const { return AccTime; } + + // Unset the bound TimingManager so that we don't report the data back. This + // is useful if we only want to track subset of certain scope events. + void ignore() { + StartTime = 0; + AccTime = 0; + Manager = nullptr; + } + +protected: + friend class TimingManager; + Timer(TimingManager &Manager, u32 HandleId) + : Manager(&Manager), HandleId(HandleId) {} + + u64 StartTime = 0; + u64 AccTime = 0; + TimingManager *Manager = nullptr; + u32 HandleId; +}; + +// A RAII-style wrapper for easy scope execution measurement. Note that in order +// not to take additional space for the message like `Name`. It only works with +// TimingManager. +class ScopedTimer : public Timer { +public: + ScopedTimer(TimingManager &Manager, const char *Name); + ScopedTimer(TimingManager &Manager, const Timer &Nest, const char *Name); + ~ScopedTimer() override { stop(); } +}; + +// In Scudo, the execution time of single run of code snippets may not be +// useful, we are more interested in the average time from several runs. +// TimingManager lets the registered timer report their data and reports the +// average execution time for each timer periodically. +class TimingManager { +public: + TimingManager(u32 PrintingInterval = DefaultPrintingInterval) + : PrintingInterval(PrintingInterval) {} + ~TimingManager() { + if (NumAllocatedTimers != 0) + printAll(); + } + + Timer getOrCreateTimer(const char *Name) EXCLUDES(Mutex) { + ScopedLock L(Mutex); + + CHECK_LT(strlen(Name), MaxLenOfTimerName); + for (u32 I = 0; I < NumAllocatedTimers; ++I) { + if (strncmp(Name, Timers[I].Name, MaxLenOfTimerName) == 0) + return Timer(*this, I); + } + + CHECK_LT(NumAllocatedTimers, MaxNumberOfTimers); + strncpy(Timers[NumAllocatedTimers].Name, Name, MaxLenOfTimerName); + TimerRecords[NumAllocatedTimers].AccumulatedTime = 0; + TimerRecords[NumAllocatedTimers].Occurrence = 0; + return Timer(*this, NumAllocatedTimers++); + } + + // Add a sub-Timer associated with another Timer. This is used when we want to + // detail the execution time in the scope of a Timer. + // For example, + // void Foo() { + // // T1 records the time spent in both first and second tasks. + // ScopedTimer T1(getTimingManager(), "Task1"); + // { + // // T2 records the time spent in first task + // ScopedTimer T2(getTimingManager, T1, "Task2"); + // // Do first task. + // } + // // Do second task. 
+ // } + // + // The report will show proper indents to indicate the nested relation like, + // -- Average Operation Time -- -- Name (# of Calls) -- + // 10.0(ns) Task1 (1) + // 5.0(ns) Task2 (1) + Timer nest(const Timer &T, const char *Name) EXCLUDES(Mutex) { + CHECK_EQ(T.Manager, this); + Timer Nesting = getOrCreateTimer(Name); + + ScopedLock L(Mutex); + CHECK_NE(Nesting.HandleId, T.HandleId); + Timers[Nesting.HandleId].Nesting = T.HandleId; + return Nesting; + } + + void report(const Timer &T) EXCLUDES(Mutex) { + ScopedLock L(Mutex); + + const u32 HandleId = T.HandleId; + CHECK_LT(HandleId, MaxNumberOfTimers); + TimerRecords[HandleId].AccumulatedTime += T.getAccumulatedTime(); + ++TimerRecords[HandleId].Occurrence; + ++NumEventsReported; + if (NumEventsReported % PrintingInterval == 0) + printAllImpl(); + } + + void printAll() EXCLUDES(Mutex) { + ScopedLock L(Mutex); + printAllImpl(); + } + +private: + void printAllImpl() REQUIRES(Mutex) { + static char NameHeader[] = "-- Name (# of Calls) --"; + static char AvgHeader[] = "-- Average Operation Time --"; + ScopedString Str; + Str.append("%-15s %-15s\n", AvgHeader, NameHeader); + + for (u32 I = 0; I < NumAllocatedTimers; ++I) { + if (Timers[I].Nesting != MaxNumberOfTimers) + continue; + printImpl(Str, I); + } + + Str.output(); + } + + void printImpl(ScopedString &Str, const u32 HandleId, + const u32 ExtraIndent = 0) REQUIRES(Mutex) { + const uptr AccumulatedTime = TimerRecords[HandleId].AccumulatedTime; + const uptr Occurrence = TimerRecords[HandleId].Occurrence; + const uptr Integral = Occurrence == 0 ? 0 : AccumulatedTime / Occurrence; + // Only keep single digit of fraction is enough and it enables easier layout + // maintenance. + const uptr Fraction = + Occurrence == 0 ? 0 + : ((AccumulatedTime % Occurrence) * 10) / Occurrence; + + Str.append("%14zu.%zu(ns) %-11s", Integral, Fraction, " "); + + for (u32 I = 0; I < ExtraIndent; ++I) + Str.append("%s", " "); + Str.append("%s (%zu)\n", Timers[HandleId].Name, Occurrence); + + for (u32 I = 0; I < NumAllocatedTimers; ++I) + if (Timers[I].Nesting == HandleId) + printImpl(Str, I, ExtraIndent + 1); + } + + // Instead of maintaining pages for timer registration, a static buffer is + // sufficient for most use cases in Scudo. + static constexpr u32 MaxNumberOfTimers = 50; + static constexpr u32 MaxLenOfTimerName = 50; + static constexpr u32 DefaultPrintingInterval = 100; + + struct Record { + uptr AccumulatedTime = 0; + uptr Occurrence = 0; + }; + + struct TimerInfo { + char Name[MaxLenOfTimerName + 1]; + u32 Nesting = MaxNumberOfTimers; + }; + + HybridMutex Mutex; + // The frequency of proactively dumping the timer statistics. For example, the + // default setting is to dump the statistics every 100 reported events. + u32 PrintingInterval GUARDED_BY(Mutex); + uptr NumEventsReported GUARDED_BY(Mutex) = 0; + u32 NumAllocatedTimers GUARDED_BY(Mutex) = 0; + TimerInfo Timers[MaxNumberOfTimers] GUARDED_BY(Mutex); + Record TimerRecords[MaxNumberOfTimers] GUARDED_BY(Mutex); +}; + +} // namespace scudo From d10110a8a60137d430f7a75051d0794293982ef6 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Thu, 23 Mar 2023 12:38:57 -0700 Subject: [PATCH 167/208] [StackProtector] attribute __stack_chk_fail as NoReturn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When GCC added support for stack smashing protections, it was defined that: > This hook returns a CALL_EXPR that alerts the runtime that the stack > protect guard variable has been modified. 
This expression should > involve a call to a noreturn function. > The default version of this hook invokes a function called > ‘__stack_chk_fail’, taking no arguments. Do so as well for __stack_smash_handler for OpenBSD. Every libc implementation I could find has __stack_chk_fail marked noreturn, or the implementation calls abort, exit, or panic (which themselves are noreturn). Glibc: https://sourceware.org/git/?p=glibc.git;a=blob;f=debug/stack_chk_fail.c Musl: https://git.musl-libc.org/cgit/musl/tree/src/env/__stack_chk_fail.c Bionic: https://android.googlesource.com/platform/bionic/+/refs/heads/master/libc/bionic/__stack_chk_fail.cpp FreeBSD: https://cgit.freebsd.org/src/tree/lib/libc/secure/stack_protector.c OpenBSD: https://github.com/openbsd/src/blob/master/lib/libc/sys/stack_protector.c NetBSD: https://github.com/NetBSD/src/blob/trunk/lib/libc/misc/stack_protector.c Linux Kernel: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/panic.c Apple: https://opensource.apple.com/source/Libc/Libc-1439.40.11/sys/OpenBSD/stack_protector.c.auto.html Link: https://gcc.gnu.org/onlinedocs/gccint/Stack-Smashing-Protection.html#Stack-Smashing-Protection This will later help us diagnose functions that fall through to other functions vs end in calls to functions that are noreturn. Reviewed By: efriedma Differential Revision: https://reviews.llvm.org/D146339 --- llvm/lib/CodeGen/StackProtector.cpp | 18 ++++++++++-------- .../test/CodeGen/X86/2009-04-14-IllegalRegs.ll | 1 - .../test/CodeGen/X86/stack-protector-weight.ll | 2 +- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp index aa92dcb386560..05ac176461a5c 100644 --- a/llvm/lib/CodeGen/StackProtector.cpp +++ b/llvm/lib/CodeGen/StackProtector.cpp @@ -15,6 +15,7 @@ #include "llvm/CodeGen/StackProtector.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/MemoryLocation.h" @@ -620,18 +621,19 @@ BasicBlock *StackProtector::CreateFailBB() { if (F->getSubprogram()) B.SetCurrentDebugLocation( DILocation::get(Context, 0, 0, F->getSubprogram())); + FunctionCallee StackChkFail; + SmallVector Args; if (Trip.isOSOpenBSD()) { - FunctionCallee StackChkFail = M->getOrInsertFunction( - "__stack_smash_handler", Type::getVoidTy(Context), - Type::getInt8PtrTy(Context)); - - B.CreateCall(StackChkFail, B.CreateGlobalStringPtr(F->getName(), "SSH")); + StackChkFail = M->getOrInsertFunction("__stack_smash_handler", + Type::getVoidTy(Context), + Type::getInt8PtrTy(Context)); + Args.push_back(B.CreateGlobalStringPtr(F->getName(), "SSH")); } else { - FunctionCallee StackChkFail = + StackChkFail = M->getOrInsertFunction("__stack_chk_fail", Type::getVoidTy(Context)); - - B.CreateCall(StackChkFail, {}); } + cast(StackChkFail.getCallee())->addFnAttr(Attribute::NoReturn); + B.CreateCall(StackChkFail, Args); B.CreateUnreachable(); return FailBB; } diff --git a/llvm/test/CodeGen/X86/2009-04-14-IllegalRegs.ll b/llvm/test/CodeGen/X86/2009-04-14-IllegalRegs.ll index 5e3bea0a83c24..da8e7b16a0cef 100644 --- a/llvm/test/CodeGen/X86/2009-04-14-IllegalRegs.ll +++ b/llvm/test/CodeGen/X86/2009-04-14-IllegalRegs.ll @@ -53,7 +53,6 @@ define i32 @z() nounwind ssp { ; CHECK-NEXT: retl ; CHECK-NEXT: LBB0_3: ## %CallStackCheckFailBlk ; CHECK-NEXT: calll ___stack_chk_fail -; CHECK-NEXT: ud2 entry: %retval = alloca i32 ; [#uses=2] %xxx = alloca %struct.X 
; [#uses=6] diff --git a/llvm/test/CodeGen/X86/stack-protector-weight.ll b/llvm/test/CodeGen/X86/stack-protector-weight.ll index 0b7620fdee657..862b130bfa4c6 100644 --- a/llvm/test/CodeGen/X86/stack-protector-weight.ll +++ b/llvm/test/CodeGen/X86/stack-protector-weight.ll @@ -10,7 +10,7 @@ ; DARWIN-SELDAG: bb.[[SUCCESS]]{{[0-9a-zA-Z_.]+}}: ; DARWIN-IR: # Machine code for function test_branch_weights: -; DARWIN-IR: successors: %bb.[[SUCCESS:[0-9]+]](0x7fffffff), %bb.[[FAILURE:[0-9]+]] +; DARWIN-IR: successors: %bb.[[SUCCESS:[0-9]+]](0x7ffff800), %bb.[[FAILURE:[0-9]+]] ; DARWIN-IR: bb.[[SUCCESS]]{{[0-9a-zA-Z_.]+}}: ; DARWIN-IR: bb.[[FAILURE]]{{[0-9a-zA-Z_.]+}}: ; DARWIN-IR: CALL64pcrel32 @__stack_chk_fail From 0c5cee779929f840f4f286c5894a01f583ee7b4a Mon Sep 17 00:00:00 2001 From: Alex Langford Date: Wed, 22 Mar 2023 16:17:49 -0700 Subject: [PATCH 168/208] [lldb-server] Use Platform plugin corresponding to the host In ee232506b870ce5282cc4da5ca493d41d361feb3 I moved UnixSignal initialization from lldbTarget to the various platform plugins. This inadvertently broke lldb-server because lldb-server doesn't use Platform plugins. lldb-server still needs to be able to create a UnixSignals object for the host platform so we can add the relevant platform plugin to lldb-server to make sure we always have a HostPlatform. Differential Revision: https://reviews.llvm.org/D146668 --- .../inferior-crashing/TestInferiorCrashing.py | 4 +++- lldb/tools/lldb-server/CMakeLists.txt | 9 +++++++++ .../lldb-server/SystemInitializerLLGS.cpp | 19 +++++++++++++++++++ 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/lldb/test/API/functionalities/inferior-crashing/TestInferiorCrashing.py b/lldb/test/API/functionalities/inferior-crashing/TestInferiorCrashing.py index b63a09d047024..172c00eb59dc2 100644 --- a/lldb/test/API/functionalities/inferior-crashing/TestInferiorCrashing.py +++ b/lldb/test/API/functionalities/inferior-crashing/TestInferiorCrashing.py @@ -63,7 +63,9 @@ def inferior_crashing(self): # The exact stop reason depends on the platform if self.platformIsDarwin(): stop_reason = 'stop reason = EXC_BAD_ACCESS' - elif self.getPlatform() == "linux" or self.getPlatform() == "freebsd": + elif self.getPlatform() == "linux": + stop_reason = 'stop reason = signal SIGSEGV: address not mapped to object' + elif self.getPlatform() == "freebsd": stop_reason = 'stop reason = signal SIGSEGV' else: stop_reason = 'stop reason = invalid address' diff --git a/lldb/tools/lldb-server/CMakeLists.txt b/lldb/tools/lldb-server/CMakeLists.txt index 67103e87a1d4a..56da4c8b56807 100644 --- a/lldb/tools/lldb-server/CMakeLists.txt +++ b/lldb/tools/lldb-server/CMakeLists.txt @@ -7,20 +7,29 @@ set(LLDB_PLUGINS) if(CMAKE_SYSTEM_NAME MATCHES "Linux|Android") list(APPEND LLDB_PLUGINS lldbPluginProcessLinux) + if (CMAKE_SYSTEM_NAME MATCHES "Linux") + list(APPEND LLDB_PLUGINS lldbPluginPlatformLinux) + else() + list(APPEND LLDB_PLUGINS lldbPluginPlatformAndroid) + endif() endif() if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") list(APPEND LLDB_PLUGINS lldbPluginProcessFreeBSD) + list(APPEND LLDB_PLUGINS lldbPluginPlatformFreeBSD) endif() if(CMAKE_SYSTEM_NAME MATCHES "NetBSD") list(APPEND LLDB_PLUGINS lldbPluginProcessNetBSD) + list(APPEND LLDB_PLUGINS lldbPluginPlatformNetBSD) endif() if(CMAKE_SYSTEM_NAME MATCHES "Darwin") list(APPEND LLDB_PLUGINS lldbPluginObjectFileMachO) + list(APPEND LLDB_PLUGINS lldbPluginPlatformMacOSX) elseif(CMAKE_SYSTEM_NAME MATCHES "Windows") list(APPEND LLDB_PLUGINS lldbPluginObjectFilePECOFF) + list(APPEND 
LLDB_PLUGINS lldbPluginPlatformWindows) else() list(APPEND LLDB_PLUGINS lldbPluginObjectFileELF) endif() diff --git a/lldb/tools/lldb-server/SystemInitializerLLGS.cpp b/lldb/tools/lldb-server/SystemInitializerLLGS.cpp index 4233252a84dfc..1909ea4dc7984 100644 --- a/lldb/tools/lldb-server/SystemInitializerLLGS.cpp +++ b/lldb/tools/lldb-server/SystemInitializerLLGS.cpp @@ -11,12 +11,29 @@ #if defined(__APPLE__) #include "Plugins/ObjectFile/Mach-O/ObjectFileMachO.h" using HostObjectFile = ObjectFileMachO; +#include "Plugins/Platform/MacOSX/PlatformMacOSX.h" +using HostPlatform = lldb_private::PlatformMacOSX; #elif defined(_WIN32) #include "Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h" using HostObjectFile = ObjectFilePECOFF; +#include "Plugins/Platform/Windows/PlatformWindows.h" +using HostPlatform = lldb_private::PlatformWindows; #else #include "Plugins/ObjectFile/ELF/ObjectFileELF.h" using HostObjectFile = ObjectFileELF; +#if defined(__ANDROID__) +#include "Plugins/Platform/Android/PlatformAndroid.h" +using HostPlatform = lldb_private::platform_android::PlatformAndroid; +#elif defined(__FreeBSD__) +#include "Plugins/Platform/FreeBSD/PlatformFreeBSD.h" +using HostPlatform = lldb_private::platform_freebsd::PlatformFreeBSD; +#elif defined(__linux__) +#include "Plugins/Platform/Linux/PlatformLinux.h" +using HostPlatform = lldb_private::platform_linux::PlatformLinux; +#elif defined(__NetBSD__) +#include "Plugins/Platform/NetBSD/PlatformNetBSD.h" +using HostPlatform = lldb_private::platform_netbsd::PlatformNetBSD; +#endif #endif #if defined(__arm64__) || defined(__aarch64__) || defined(_M_ARM64) @@ -58,6 +75,7 @@ llvm::Error SystemInitializerLLGS::Initialize() { return e; HostObjectFile::Initialize(); + HostPlatform::Initialize(); #if defined(LLDB_TARGET_ARM) || defined(LLDB_TARGET_ARM64) EmulateInstructionARM::Initialize(); @@ -80,6 +98,7 @@ llvm::Error SystemInitializerLLGS::Initialize() { void SystemInitializerLLGS::Terminate() { HostObjectFile::Terminate(); + HostPlatform::Terminate(); #if defined(LLDB_TARGET_ARM) || defined(LLDB_TARGET_ARM64) EmulateInstructionARM::Terminate(); From 4b398ec456cdba142251918267c605d49ca7c6ef Mon Sep 17 00:00:00 2001 From: Kirill Stoimenov Date: Thu, 23 Mar 2023 20:25:47 +0000 Subject: [PATCH 169/208] [HWASAN] Fix decorate_proc_maps to work with HWASAN --- .../sanitizer_common/TestCases/Linux/decorate_proc_maps.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/decorate_proc_maps.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/decorate_proc_maps.cpp index 60f32c2d83dbf..d0400afe743b3 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Linux/decorate_proc_maps.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/decorate_proc_maps.cpp @@ -49,6 +49,10 @@ int main(void) { // CHECK-asan: ---p {{.*}} [shadow gap] // CHECK-asan: rw-p {{.*}} [high shadow] +// CHECK-hwasan: rw-p {{.*}} [low shadow] +// CHECK-hwasan: ---p {{.*}} [shadow gap] +// CHECK-hwasan: rw-p {{.*}} [high shadow] + // CHECK-msan: ---p {{.*}} [invalid] // CHECK-msan: rw-p {{.*}} [shadow{{.*}}] // CHECK-msan: ---p {{.*}} [origin{{.*}}] From 1d30afdc2d97d348b971ae48716a0e79c6c5a29e Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Thu, 23 Mar 2023 13:22:32 -0700 Subject: [PATCH 170/208] [PATCH] Enable targeting riscv64-linux-android Reviewers: ccross, asb, phosek, enh, srhines, hiraditya Putting: https://android.googlesource.com/toolchain/llvm_android/+/refs/heads/master/patches/Enable-targeting-riscv64-linux-android.patch for 
review. Differential Revision: https://reviews.llvm.org/D146560 --- clang/lib/Driver/ToolChains/Linux.cpp | 2 ++ compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake | 2 +- compiler-rt/lib/sanitizer_common/sanitizer_linux.h | 3 +++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp index 848d7247c20c0..e25895fd0b636 100644 --- a/clang/lib/Driver/ToolChains/Linux.cpp +++ b/clang/lib/Driver/ToolChains/Linux.cpp @@ -126,6 +126,8 @@ std::string Linux::getMultiarchTriple(const Driver &D, case llvm::Triple::ppc64le: return "powerpc64le-linux-gnu"; case llvm::Triple::riscv64: + if (IsAndroid) + return "riscv64-linux-android"; return "riscv64-linux-gnu"; case llvm::Triple::sparc: return "sparc-linux-gnu"; diff --git a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake index e3fe5570de26a..99d672de4e882 100644 --- a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake +++ b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake @@ -42,7 +42,7 @@ if(OS_NAME MATCHES "Linux") elseif (OS_NAME MATCHES "Windows") set(ALL_FUZZER_SUPPORTED_ARCH ${X86} ${X86_64}) elseif(OS_NAME MATCHES "Android") - set(ALL_FUZZER_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64}) + set(ALL_FUZZER_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64} ${RISCV64}) else() set(ALL_FUZZER_SUPPORTED_ARCH ${X86_64} ${ARM64}) endif() diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.h b/compiler-rt/lib/sanitizer_common/sanitizer_linux.h index 2c769dd59aa09..c84c04a877594 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.h @@ -152,6 +152,9 @@ inline void ReleaseMemoryPagesToOSAndZeroFill(uptr beg, uptr end) { "rdhwr %0,$29\n" \ ".set pop\n" : "=r"(__v)); \ __v; }) +#elif defined (__riscv) +# define __get_tls() \ + ({ void** __v; __asm__("mv %0, tp" : "=r"(__v)); __v; }) #elif defined(__i386__) # define __get_tls() \ ({ void** __v; __asm__("movl %%gs:0, %0" : "=r"(__v)); __v; }) From 156d966ec47041a071022460d68d94717460fa5a Mon Sep 17 00:00:00 2001 From: AdityaK <1894981+hiraditya@users.noreply.github.com> Date: Tue, 21 Mar 2023 15:42:25 -0700 Subject: [PATCH 171/208] Remove mips target triple for Android Reviewers: enh, phosek, srhines, MaskRay thanks to @enh for pointing these out. 
Differential Revision: https://reviews.llvm.org/D146565 --- clang/lib/Driver/ToolChains/Gnu.cpp | 19 ------ clang/lib/Driver/ToolChains/Linux.cpp | 4 -- clang/test/Driver/android-ndk-standalone.cpp | 16 ----- clang/test/Driver/android-pie.c | 10 --- clang/test/Driver/android-standalone.cpp | 48 ------------- clang/test/Driver/clang-translation.c | 18 ----- clang/test/Driver/linux-ld.c | 72 -------------------- clang/test/Driver/pic.c | 3 - 8 files changed, 190 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index 7e72a1d1433da..0c8868109f7ee 100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -2431,9 +2431,6 @@ void Generic_GCC::GCCInstallationDetector::AddDefaultGCCPrefixes( static const char *const AArch64AndroidTriples[] = { "aarch64-linux-android"}; static const char *const ARMAndroidTriples[] = {"arm-linux-androideabi"}; - static const char *const MIPSELAndroidTriples[] = {"mipsel-linux-android"}; - static const char *const MIPS64ELAndroidTriples[] = { - "mips64el-linux-android"}; static const char *const X86AndroidTriples[] = {"i686-linux-android"}; static const char *const X86_64AndroidTriples[] = {"x86_64-linux-android"}; @@ -2448,22 +2445,6 @@ void Generic_GCC::GCCInstallationDetector::AddDefaultGCCPrefixes( LibDirs.append(begin(ARMLibDirs), end(ARMLibDirs)); TripleAliases.append(begin(ARMAndroidTriples), end(ARMAndroidTriples)); break; - case llvm::Triple::mipsel: - LibDirs.append(begin(MIPSELLibDirs), end(MIPSELLibDirs)); - TripleAliases.append(begin(MIPSELAndroidTriples), - end(MIPSELAndroidTriples)); - BiarchLibDirs.append(begin(MIPS64ELLibDirs), end(MIPS64ELLibDirs)); - BiarchTripleAliases.append(begin(MIPS64ELAndroidTriples), - end(MIPS64ELAndroidTriples)); - break; - case llvm::Triple::mips64el: - LibDirs.append(begin(MIPS64ELLibDirs), end(MIPS64ELLibDirs)); - TripleAliases.append(begin(MIPS64ELAndroidTriples), - end(MIPS64ELAndroidTriples)); - BiarchLibDirs.append(begin(MIPSELLibDirs), end(MIPSELLibDirs)); - BiarchTripleAliases.append(begin(MIPSELAndroidTriples), - end(MIPSELAndroidTriples)); - break; case llvm::Triple::x86_64: LibDirs.append(begin(X86_64LibDirs), end(X86_64LibDirs)); TripleAliases.append(begin(X86_64AndroidTriples), diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp index e25895fd0b636..77ad9605addab 100644 --- a/clang/lib/Driver/ToolChains/Linux.cpp +++ b/clang/lib/Driver/ToolChains/Linux.cpp @@ -92,8 +92,6 @@ std::string Linux::getMultiarchTriple(const Driver &D, case llvm::Triple::mips: return IsMipsR6 ? "mipsisa32r6-linux-gnu" : "mips-linux-gnu"; case llvm::Triple::mipsel: - if (IsAndroid) - return "mipsel-linux-android"; return IsMipsR6 ? "mipsisa32r6el-linux-gnu" : "mipsel-linux-gnu"; case llvm::Triple::mips64: { std::string MT = std::string(IsMipsR6 ? "mipsisa64r6" : "mips64") + @@ -105,8 +103,6 @@ std::string Linux::getMultiarchTriple(const Driver &D, break; } case llvm::Triple::mips64el: { - if (IsAndroid) - return "mips64el-linux-android"; std::string MT = std::string(IsMipsR6 ? "mipsisa64r6el" : "mips64el") + "-linux-" + (IsMipsN32Abi ? 
"gnuabin32" : "gnuabi64"); if (D.getVFS().exists(concat(SysRoot, "/lib", MT))) diff --git a/clang/test/Driver/android-ndk-standalone.cpp b/clang/test/Driver/android-ndk-standalone.cpp index aeb2678c9caae..397460dbd7803 100644 --- a/clang/test/Driver/android-ndk-standalone.cpp +++ b/clang/test/Driver/android-ndk-standalone.cpp @@ -246,22 +246,6 @@ // CHECK-ARM64: "-L{{.*}}/lib/gcc/aarch64-linux-android/4.9/../../../../aarch64-linux-android/lib" // // RUN: %clang -### %s 2>&1 \ -// RUN: --target=mipsel-linux-android21 \ -// RUN: -mips32 \ -// RUN: --gcc-toolchain=%S/Inputs/basic_android_ndk_tree \ -// RUN: --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \ -// RUN: | FileCheck --check-prefix=CHECK-MIPS %s -// CHECK-MIPS: "-cc1" -// CHECK-MIPS: "-internal-isystem" "{{.*}}/include/c++/v1" -// CHECK-MIPS: "-internal-externc-isystem" "{{.*}}/sysroot/include" -// CHECK-MIPS: "-internal-externc-isystem" "{{.*}}/sysroot/usr/include" -// CHECK-MIPS: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]" -// CHECK-MIPS: "-L{{.*}}/lib/gcc/mipsel-linux-android/4.9" -// CHECK-MIPS: "-L{{.*}}/sysroot/usr/lib/mipsel-linux-android/21" -// CHECK-MIPS: "-L{{.*}}/sysroot/usr/lib/mipsel-linux-android" -// CHECK-MIPS: "-L{{.*}}/lib/gcc/mipsel-linux-android/4.9/../../../../mipsel-linux-android/lib" -// -// RUN: %clang -### %s 2>&1 \ // RUN: --target=i686-linux-android21 \ // RUN: --gcc-toolchain=%S/Inputs/basic_android_ndk_tree \ // RUN: --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \ diff --git a/clang/test/Driver/android-pie.c b/clang/test/Driver/android-pie.c index c006b90891e89..8620e18565458 100644 --- a/clang/test/Driver/android-pie.c +++ b/clang/test/Driver/android-pie.c @@ -8,11 +8,6 @@ // RUN: %clang %s -### -o %t.o 2>&1 --target=arm-linux-android24 \ // RUN: | FileCheck --check-prefix=PIE %s -// RUN: %clang %s -### -o %t.o 2>&1 --target=mipsel-linux-android \ -// RUN: | FileCheck --check-prefix=PIE %s -// RUN: %clang %s -### -o %t.o 2>&1 --target=mipsel-linux-android24 \ -// RUN: | FileCheck --check-prefix=PIE %s - // RUN: %clang %s -### -o %t.o 2>&1 --target=i686-linux-android \ // RUN: | FileCheck --check-prefix=PIE %s // RUN: %clang %s -### -o %t.o 2>&1 --target=i686-linux-android24 \ @@ -28,11 +23,6 @@ // RUN: %clang %s -### -o %t.o 2>&1 --target=arm64-linux-android24 \ // RUN: | FileCheck --check-prefix=PIE %s -// RUN: %clang %s -### -o %t.o 2>&1 --target=mips64el-linux-android \ -// RUN: | FileCheck --check-prefix=PIE %s -// RUN: %clang %s -### -o %t.o 2>&1 --target=mips64el-linux-android24 \ -// RUN: | FileCheck --check-prefix=PIE %s - // RUN: %clang %s -### -o %t.o 2>&1 --target=x86_64-linux-android \ // RUN: | FileCheck --check-prefix=PIE %s // RUN: %clang %s -### -o %t.o 2>&1 --target=x86_64-linux-android24 \ diff --git a/clang/test/Driver/android-standalone.cpp b/clang/test/Driver/android-standalone.cpp index 7363497c880a5..0246d1371deb9 100644 --- a/clang/test/Driver/android-standalone.cpp +++ b/clang/test/Driver/android-standalone.cpp @@ -45,51 +45,3 @@ // CHECK-ARM64: "-L{{.*}}/lib/gcc/aarch64-linux-android/4.8" // CHECK-ARM64: "-L{{.*}}/lib/gcc/aarch64-linux-android/4.8/../../../../aarch64-linux-android/lib" // CHECK-ARM64: "-L{{.*}}/sysroot/usr/lib" -// -// RUN: %clang -### %s 2>&1 \ -// RUN: --target=mipsel-linux-android \ -// RUN: -mips32 -stdlib=libstdc++ \ -// RUN: --gcc-toolchain=%S/Inputs/basic_android_tree \ -// RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ -// RUN: | FileCheck --check-prefix=CHECK-MIPS %s -// CHECK-MIPS: "-cc1" -// CHECK-MIPS: "-internal-isystem" 
"{{.*}}/mipsel-linux-android/include/c++/4.4.3" -// CHECK-MIPS: "-internal-isystem" "{{.*}}/mipsel-linux-android/include/c++/4.4.3/mipsel-linux-android" -// CHECK-MIPS: "-internal-externc-isystem" "{{.*}}/sysroot/include" -// CHECK-MIPS: "-internal-externc-isystem" "{{.*}}/sysroot/usr/include" -// CHECK-MIPS: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]" -// CHECK-MIPS: "-L{{.*}}/lib/gcc/mipsel-linux-android/4.4.3" -// CHECK-MIPS: "-L{{.*}}/lib/gcc/mipsel-linux-android/4.4.3/../../../../mipsel-linux-android/lib" -// CHECK-MIPS: "-L{{.*}}/sysroot/usr/lib" -// -// RUN: %clang -### %s 2>&1 \ -// RUN: --target=mipsel-linux-android \ -// RUN: -march=mips32 -mips32r2 -stdlib=libstdc++ \ -// RUN: --gcc-toolchain=%S/Inputs/basic_android_tree \ -// RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ -// RUN: | FileCheck --check-prefix=CHECK-MIPSR2 %s -// CHECK-MIPSR2: "-cc1" -// CHECK-MIPSR2: "-internal-isystem" "{{.*}}/mipsel-linux-android/include/c++/4.4.3" -// CHECK-MIPSR2: "-internal-isystem" "{{.*}}/mipsel-linux-android/include/c++/4.4.3/mipsel-linux-android" -// CHECK-MIPSR2: "-internal-externc-isystem" "{{.*}}/sysroot/include" -// CHECK-MIPSR2: "-internal-externc-isystem" "{{.*}}/sysroot/usr/include" -// CHECK-MIPSR2: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]" -// CHECK-MIPSR2: "-L{{.*}}/lib/gcc/mipsel-linux-android/4.4.3/mips-r2" -// CHECK-MIPSR2: "-L{{.*}}/lib/gcc/mipsel-linux-android/4.4.3/../../../../mipsel-linux-android/lib" -// CHECK-MIPSR2: "-L{{.*}}/sysroot/usr/lib" -// -// RUN: %clang -### %s 2>&1 \ -// RUN: --target=mipsel-linux-android \ -// RUN: -mips32 -march=mips32r2 -stdlib=libstdc++ \ -// RUN: --gcc-toolchain=%S/Inputs/basic_android_tree \ -// RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ -// RUN: | FileCheck --check-prefix=CHECK-MIPSR2-A %s -// CHECK-MIPSR2-A: "-cc1" -// CHECK-MIPSR2-A: "-internal-isystem" "{{.*}}/mipsel-linux-android/include/c++/4.4.3" -// CHECK-MIPSR2-A: "-internal-isystem" "{{.*}}/mipsel-linux-android/include/c++/4.4.3/mipsel-linux-android" -// CHECK-MIPSR2-A: "-internal-externc-isystem" "{{.*}}/sysroot/include" -// CHECK-MIPSR2-A: "-internal-externc-isystem" "{{.*}}/sysroot/usr/include" -// CHECK-MIPSR2-A: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]" -// CHECK-MIPSR2-A: "-L{{.*}}/lib/gcc/mipsel-linux-android/4.4.3/mips-r2" -// CHECK-MIPSR2-A: "-L{{.*}}/lib/gcc/mipsel-linux-android/4.4.3/../../../../mipsel-linux-android/lib" -// CHECK-MIPSR2-A: "-L{{.*}}/sysroot/usr/lib" diff --git a/clang/test/Driver/clang-translation.c b/clang/test/Driver/clang-translation.c index ca98ca5e8228d..058ac32bbdb50 100644 --- a/clang/test/Driver/clang-translation.c +++ b/clang/test/Driver/clang-translation.c @@ -392,24 +392,6 @@ // MIPSR6EL: "-target-cpu" "mips32r6" // MIPSR6EL: "-mfloat-abi" "hard" -// RUN: %clang -target mipsel-linux-android -### -S %s 2>&1 | \ -// RUN: FileCheck -check-prefix=MIPSEL-ANDROID %s -// MIPSEL-ANDROID: clang -// MIPSEL-ANDROID: "-cc1" -// MIPSEL-ANDROID: "-target-cpu" "mips32" -// MIPSEL-ANDROID: "-target-feature" "+fpxx" -// MIPSEL-ANDROID: "-target-feature" "+nooddspreg" -// MIPSEL-ANDROID: "-mfloat-abi" "hard" - -// RUN: %clang -target mipsel-linux-android -### -S %s -mcpu=mips32r6 2>&1 | \ -// RUN: FileCheck -check-prefix=MIPSEL-ANDROID-R6 %s -// MIPSEL-ANDROID-R6: clang -// MIPSEL-ANDROID-R6: "-cc1" -// MIPSEL-ANDROID-R6: "-target-cpu" "mips32r6" -// MIPSEL-ANDROID-R6: "-target-feature" "+fp64" -// MIPSEL-ANDROID-R6: "-target-feature" "+nooddspreg" -// MIPSEL-ANDROID-R6: "-mfloat-abi" "hard" - // RUN: %clang 
-target mips64-linux-gnu -### -S %s 2>&1 | \ // RUN: FileCheck -check-prefix=MIPS64 %s // MIPS64: clang diff --git a/clang/test/Driver/linux-ld.c b/clang/test/Driver/linux-ld.c index 10e6bee183050..be1230ac0ab63 100644 --- a/clang/test/Driver/linux-ld.c +++ b/clang/test/Driver/linux-ld.c @@ -1046,16 +1046,6 @@ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ // RUN: | FileCheck --check-prefix=CHECK-ANDROID %s // RUN: %clang -### %s -no-pie 2>&1 \ -// RUN: --target=mipsel-linux-android -rtlib=platform --unwindlib=platform \ -// RUN: --gcc-toolchain="" \ -// RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ -// RUN: | FileCheck --check-prefix=CHECK-ANDROID %s -// RUN: %clang -### %s -no-pie 2>&1 \ -// RUN: --target=mips64el-linux-android -rtlib=platform --unwindlib=platform \ -// RUN: --gcc-toolchain="" \ -// RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ -// RUN: | FileCheck --check-prefix=CHECK-ANDROID %s -// RUN: %clang -### %s -no-pie 2>&1 \ // RUN: --target=i686-linux-android -rtlib=platform --unwindlib=platform \ // RUN: --gcc-toolchain="" \ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ @@ -1101,18 +1091,6 @@ // RUN: -shared \ // RUN: | FileCheck --check-prefix=CHECK-ANDROID-SO %s // RUN: %clang -### %s -no-pie 2>&1 \ -// RUN: --target=mipsel-linux-android -rtlib=platform --unwindlib=platform \ -// RUN: --gcc-toolchain="" \ -// RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ -// RUN: -shared \ -// RUN: | FileCheck --check-prefix=CHECK-ANDROID-SO %s -// RUN: %clang -### %s -no-pie 2>&1 \ -// RUN: --target=mips64el-linux-android -rtlib=platform --unwindlib=platform \ -// RUN: --gcc-toolchain="" \ -// RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ -// RUN: -shared \ -// RUN: | FileCheck --check-prefix=CHECK-ANDROID-SO %s -// RUN: %clang -### %s -no-pie 2>&1 \ // RUN: --target=i686-linux-android -rtlib=platform --unwindlib=platform \ // RUN: --gcc-toolchain="" \ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ @@ -1158,18 +1136,6 @@ // RUN: -static \ // RUN: | FileCheck --check-prefix=CHECK-ANDROID-STATIC %s // RUN: %clang -### %s -no-pie 2>&1 \ -// RUN: --target=mipsel-linux-android -rtlib=platform --unwindlib=platform \ -// RUN: --gcc-toolchain="" \ -// RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ -// RUN: -static \ -// RUN: | FileCheck --check-prefix=CHECK-ANDROID-STATIC %s -// RUN: %clang -### %s -no-pie 2>&1 \ -// RUN: --target=mips64el-linux-android -rtlib=platform --unwindlib=platform \ -// RUN: --gcc-toolchain="" \ -// RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ -// RUN: -static \ -// RUN: | FileCheck --check-prefix=CHECK-ANDROID-STATIC %s -// RUN: %clang -### %s -no-pie 2>&1 \ // RUN: --target=i686-linux-android -rtlib=platform --unwindlib=platform \ // RUN: --gcc-toolchain="" \ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ @@ -1216,18 +1182,6 @@ // RUN: -pie \ // RUN: | FileCheck --check-prefix=CHECK-ANDROID-PIE %s // RUN: %clang -### %s -no-pie 2>&1 \ -// RUN: --target=mipsel-linux-android -rtlib=platform --unwindlib=platform \ -// RUN: --gcc-toolchain="" \ -// RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ -// RUN: -pie \ -// RUN: | FileCheck --check-prefix=CHECK-ANDROID-PIE %s -// RUN: %clang -### %s -no-pie 2>&1 \ -// RUN: --target=mips64el-linux-android -rtlib=platform --unwindlib=platform \ -// RUN: --gcc-toolchain="" \ -// RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ -// RUN: -pie \ -// RUN: | FileCheck --check-prefix=CHECK-ANDROID-PIE %s -// RUN: %clang -### %s -no-pie 2>&1 \ 
// RUN: --target=i686-linux-android -rtlib=platform --unwindlib=platform \ // RUN: --gcc-toolchain="" \ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ @@ -1259,11 +1213,6 @@ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ // RUN: | FileCheck --check-prefix=CHECK-ANDROID-32 %s // RUN: %clang -### %s -no-pie 2>&1 \ -// RUN: --target=mipsel-linux-android \ -// RUN: --gcc-toolchain="" \ -// RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ -// RUN: | FileCheck --check-prefix=CHECK-ANDROID-32 %s -// RUN: %clang -### %s -no-pie 2>&1 \ // RUN: --target=aarch64-linux-android \ // RUN: --gcc-toolchain="" \ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ @@ -1313,15 +1262,6 @@ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ // RUN: | FileCheck --check-prefix=CHECK-ANDROID-PTHREAD %s // RUN: %clang -### %s -no-pie 2>&1 \ -// RUN: --target=mipsel-linux-android -pthread \ -// RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ -// RUN: | FileCheck --check-prefix=CHECK-ANDROID-PTHREAD %s -// RUN: %clang -### %s -no-pie 2>&1 \ -// RUN: --target=mips64el-linux-android -pthread \ -// RUN: --gcc-toolchain="" \ -// RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ -// RUN: | FileCheck --check-prefix=CHECK-ANDROID-PTHREAD %s -// RUN: %clang -### %s -no-pie 2>&1 \ // RUN: --target=i686-linux-android -pthread \ // RUN: --gcc-toolchain="" \ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ @@ -1356,18 +1296,6 @@ // RUN: -shared \ // RUN: | FileCheck --check-prefix=CHECK-ANDROID-PTHREAD %s // RUN: %clang -### %s -no-pie 2>&1 \ -// RUN: --target=mipsel-linux-android -pthread \ -// RUN: --gcc-toolchain="" \ -// RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ -// RUN: -shared \ -// RUN: | FileCheck --check-prefix=CHECK-ANDROID-PTHREAD %s -// RUN: %clang -### %s -no-pie 2>&1 \ -// RUN: --target=mips64el-linux-android -pthread \ -// RUN: --gcc-toolchain="" \ -// RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ -// RUN: -shared \ -// RUN: | FileCheck --check-prefix=CHECK-ANDROID-PTHREAD %s -// RUN: %clang -### %s -no-pie 2>&1 \ // RUN: --target=i686-linux-android -pthread \ // RUN: --gcc-toolchain="" \ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ diff --git a/clang/test/Driver/pic.c b/clang/test/Driver/pic.c index b05f363603a2a..daa3a55430068 100644 --- a/clang/test/Driver/pic.c +++ b/clang/test/Driver/pic.c @@ -280,9 +280,6 @@ // RUN: %clang -c %s -target arm-linux-androideabi24 -### 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-PIE2 // -// RUN: %clang -c %s -target mipsel-linux-android24 -### 2>&1 \ -// RUN: | FileCheck %s --check-prefix=CHECK-PIE1 -// // 64-bit Android targets are always PIE. // RUN: %clang -c %s -target aarch64-linux-android -### 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-PIE2 From f23dcb2f2a4728b812d70ead630798a205e77d60 Mon Sep 17 00:00:00 2001 From: Gulfem Savrun Yeniceri Date: Thu, 23 Mar 2023 20:54:21 +0000 Subject: [PATCH 172/208] Revert "[JITLink] Initial AArch32 backend" This reverts commit c2de8ff92753acdb1ace7a27cc11cb09f28eb8fa. It caused a segmentation fault while running ExecutionEngine tests on Mac. 
https://luci-milo.appspot.com/ui/p/fuchsia/builders/toolchain.ci/clang-mac-x64/b8785839382041226465/overview --- .../ExecutionEngine/JITLink/ELF_aarch32.h | 38 -- .../llvm/ExecutionEngine/JITLink/aarch32.h | 293 ---------- .../ExecutionEngine/JITLink/CMakeLists.txt | 2 - llvm/lib/ExecutionEngine/JITLink/ELF.cpp | 9 - .../JITLink/ELFLinkGraphBuilder.h | 21 - .../ExecutionEngine/JITLink/ELF_aarch32.cpp | 299 ---------- llvm/lib/ExecutionEngine/JITLink/aarch32.cpp | 514 ------------------ .../Orc/ObjectLinkingLayer.cpp | 6 +- .../JITLink/AArch32/ELF_thumbv7_printf.s | 46 -- .../JITLink/AArch32/lit.local.cfg | 2 - .../ExecutionEngine/JITLink/AArch32Tests.cpp | 200 ------- .../ExecutionEngine/JITLink/CMakeLists.txt | 1 - 12 files changed, 1 insertion(+), 1430 deletions(-) delete mode 100644 llvm/include/llvm/ExecutionEngine/JITLink/ELF_aarch32.h delete mode 100644 llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h delete mode 100644 llvm/lib/ExecutionEngine/JITLink/ELF_aarch32.cpp delete mode 100644 llvm/lib/ExecutionEngine/JITLink/aarch32.cpp delete mode 100644 llvm/test/ExecutionEngine/JITLink/AArch32/ELF_thumbv7_printf.s delete mode 100644 llvm/test/ExecutionEngine/JITLink/AArch32/lit.local.cfg delete mode 100644 llvm/unittests/ExecutionEngine/JITLink/AArch32Tests.cpp diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/ELF_aarch32.h b/llvm/include/llvm/ExecutionEngine/JITLink/ELF_aarch32.h deleted file mode 100644 index 25d1c3aac2c26..0000000000000 --- a/llvm/include/llvm/ExecutionEngine/JITLink/ELF_aarch32.h +++ /dev/null @@ -1,38 +0,0 @@ -//===---- ELF_aarch32.h - JIT link functions for arm/thumb -----*- C++ -*--===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// jit-link functions for ELF/aarch32. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_EXECUTIONENGINE_JITLINK_ELF_AARCH32 -#define LLVM_EXECUTIONENGINE_JITLINK_ELF_AARCH32 - -#include "llvm/ExecutionEngine/JITLink/JITLink.h" -#include "llvm/ExecutionEngine/JITLink/aarch32.h" - -namespace llvm { -namespace jitlink { - -/// Create a LinkGraph from an ELF/arm relocatable object -/// -/// Note: The graph does not take ownership of the underlying buffer, nor copy -/// its contents. The caller is responsible for ensuring that the object buffer -/// outlives the graph. -Expected> -createLinkGraphFromELFObject_aarch32(MemoryBufferRef ObjectBuffer); - -/// jit-link the given object buffer, which must be an ELF arm/thumb object -/// file. -void link_ELF_aarch32(std::unique_ptr G, - std::unique_ptr Ctx); - -} // end namespace jitlink -} // end namespace llvm - -#endif // LLVM_EXECUTIONENGINE_JITLINK_ELF_AARCH32 diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h b/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h deleted file mode 100644 index 8488b10278771..0000000000000 --- a/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h +++ /dev/null @@ -1,293 +0,0 @@ -//===------ aarch32.h - Generic JITLink arm/thumb utilities -----*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Generic utilities for graphs representing arm/thumb objects. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_EXECUTIONENGINE_JITLINK_AARCH32 -#define LLVM_EXECUTIONENGINE_JITLINK_AARCH32 - -#include "TableManager.h" -#include "llvm/ExecutionEngine/JITLink/JITLink.h" -#include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h" -#include "llvm/Support/ARMBuildAttributes.h" -#include "llvm/Support/Error.h" - -namespace llvm { -namespace jitlink { -namespace aarch32 { - -/// JITLink-internal AArch32 fixup kinds -enum EdgeKind_aarch32 : Edge::Kind { - - /// - /// Relocations of class Data - /// - FirstDataRelocation = Edge::FirstRelocation, - - /// Plain 32-bit value relocation in target endianness - Data_Delta32 = FirstDataRelocation, - - LastDataRelocation = Data_Delta32, - - /// - /// Relocations of class Arm (covers fixed-width 4-byte instruction subset) - /// - FirstArmRelocation, - - /// TODO: Arm_Call is here only as a placeholder for now. - Arm_Call = FirstArmRelocation, - - LastArmRelocation = Arm_Call, - - /// - /// Relocations of class Thumb16 and Thumb32 (covers Thumb instruction subset) - /// - FirstThumbRelocation, - - /// Write immediate value for PC-relative branch with link (can bridge between - /// Arm and Thumb). - Thumb_Call = FirstThumbRelocation, - - /// Write immediate value for (unconditional) PC-relative branch without link. - Thumb_Jump24, - - /// Write immediate value to the lower halfword of the destination register - Thumb_MovwAbsNC, - - /// Write immediate value to the top halfword of the destination register - Thumb_MovtAbs, - - LastThumbRelocation = Thumb_MovtAbs, -}; - -/// Flags enum for AArch32-specific symbol properties -enum TargetFlags_aarch32 : TargetFlagsType { - ThumbSymbol = 1 << 0, -}; - -/// Human-readable name for a given CPU architecture kind -const char *getCPUArchName(ARMBuildAttrs::CPUArch K); - -/// Get a human-readable name for the given AArch32 edge kind. -const char *getEdgeKindName(Edge::Kind K); - -/// AArch32 uses stubs for a number of purposes, like branch range extension -/// or interworking between Arm and Thumb instruction subsets. -/// -/// Stub implementations vary depending on CPU architecture (v4, v6, v7), -/// instruction subset and branch type (absolute/PC-relative). -/// -/// For each kind of stub, the StubsFlavor defines one concrete form that is -/// used throughout the LinkGraph. -/// -/// Stubs are often called "veneers" in the official docs and online. -/// -enum StubsFlavor { - Unsupported = 0, - Thumbv7, -}; - -/// JITLink sub-arch configuration for Arm CPU models -struct ArmConfig { - bool J1J2BranchEncoding = false; - StubsFlavor Stubs = Unsupported; -}; - -/// Obtain the sub-arch configuration for a given Arm CPU model. 
-inline ArmConfig getArmConfigForCPUArch(ARMBuildAttrs::CPUArch CPUArch) { - ArmConfig ArmCfg; - switch (CPUArch) { - case ARMBuildAttrs::v7: - case ARMBuildAttrs::v8_A: - ArmCfg.J1J2BranchEncoding = true; - ArmCfg.Stubs = Thumbv7; - break; - default: - DEBUG_WITH_TYPE("jitlink", { - dbgs() << " Warning: ARM config not defined for CPU architecture " - << getCPUArchName(CPUArch); - }); - break; - } - return ArmCfg; -} - -/// Immutable pair of halfwords, Hi and Lo, with overflow check -struct HalfWords { - constexpr HalfWords() : Hi(0), Lo(0) {} - constexpr HalfWords(uint32_t Hi, uint32_t Lo) : Hi(Hi), Lo(Lo) { - assert(isUInt<16>(Hi) && "Overflow in first half-word"); - assert(isUInt<16>(Lo) && "Overflow in second half-word"); - } - const uint16_t Hi; // First halfword - const uint16_t Lo; // Second halfword -}; - -/// Collection of named constants per fixup kind. It may contain but is not -/// limited to the following entries: -/// -/// Opcode - Values of the op-code bits in the instruction, with -/// unaffected bits nulled -/// OpcodeMask - Mask with all bits set that encode the op-code -/// ImmMask - Mask with all bits set that encode the immediate value -/// RegMask - Mask with all bits set that encode the register -/// -template struct FixupInfo {}; - -template <> struct FixupInfo { - static constexpr HalfWords Opcode{0xf000, 0x8000}; - static constexpr HalfWords OpcodeMask{0xf800, 0x8000}; - static constexpr HalfWords ImmMask{0x07ff, 0x2fff}; - static constexpr uint16_t LoBitConditional = 0x1000; -}; - -template <> struct FixupInfo { - static constexpr HalfWords Opcode{0xf000, 0xc000}; - static constexpr HalfWords OpcodeMask{0xf800, 0xc000}; - static constexpr HalfWords ImmMask{0x07ff, 0x2fff}; - static constexpr uint16_t LoBitH = 0x0001; - static constexpr uint16_t LoBitNoBlx = 0x1000; -}; - -template <> struct FixupInfo { - static constexpr HalfWords Opcode{0xf2c0, 0x0000}; - static constexpr HalfWords OpcodeMask{0xfbf0, 0x8000}; - static constexpr HalfWords ImmMask{0x040f, 0x70ff}; - static constexpr HalfWords RegMask{0x0000, 0x0f00}; -}; - -template <> -struct FixupInfo : public FixupInfo { - static constexpr HalfWords Opcode{0xf240, 0x0000}; -}; - -/// Helper function to read the initial addend for Data-class relocations. -Expected readAddendData(LinkGraph &G, Block &B, const Edge &E); - -/// Helper function to read the initial addend for Arm-class relocations. -Expected readAddendArm(LinkGraph &G, Block &B, const Edge &E); - -/// Helper function to read the initial addend for Thumb-class relocations. -Expected readAddendThumb(LinkGraph &G, Block &B, const Edge &E, - const ArmConfig &ArmCfg); - -/// Read the initial addend for a REL-type relocation. It's the value encoded -/// in the immediate field of the fixup location by the compiler. -inline Expected readAddend(LinkGraph &G, Block &B, const Edge &E, - const ArmConfig &ArmCfg) { - Edge::Kind Kind = E.getKind(); - if (Kind <= LastDataRelocation) - return readAddendData(G, B, E); - - if (Kind <= LastArmRelocation) - return readAddendArm(G, B, E); - - if (Kind <= LastThumbRelocation) - return readAddendThumb(G, B, E, ArmCfg); - - llvm_unreachable("Relocation must be of class Data, Arm or Thumb"); -} - -/// Helper function to apply the fixup for Data-class relocations. -Error applyFixupData(LinkGraph &G, Block &B, const Edge &E); - -/// Helper function to apply the fixup for Arm-class relocations. -Error applyFixupArm(LinkGraph &G, Block &B, const Edge &E); - -/// Helper function to apply the fixup for Thumb-class relocations. 
-Error applyFixupThumb(LinkGraph &G, Block &B, const Edge &E, - const ArmConfig &ArmCfg); - -/// Apply fixup expression for edge to block content. -inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E, - const ArmConfig &ArmCfg) { - Edge::Kind Kind = E.getKind(); - - if (Kind <= LastDataRelocation) - return applyFixupData(G, B, E); - - if (Kind <= LastArmRelocation) - return applyFixupArm(G, B, E); - - if (Kind <= LastThumbRelocation) - return applyFixupThumb(G, B, E, ArmCfg); - - llvm_unreachable("Relocation must be of class Data, Arm or Thumb"); -} - -/// Stubs builder for a specific StubsFlavor -/// -/// Right now we only have one default stub kind, but we want to extend this -/// and allow creation of specific kinds in the future (e.g. branch range -/// extension or interworking). -/// -/// Let's keep it simple for the moment and not wire this through a GOT. -/// -template -class StubsManager : public TableManager> { -public: - StubsManager() = default; - - /// Name of the object file section that will contain all our stubs. - static StringRef getSectionName() { return "__llvm_jitlink_STUBS"; } - - /// Implements link-graph traversal via visitExistingEdges(). - bool visitEdge(LinkGraph &G, Block *B, Edge &E) { - if (E.getTarget().isDefined()) - return false; - - switch (E.getKind()) { - case Thumb_Call: - case Thumb_Jump24: { - DEBUG_WITH_TYPE("jitlink", { - dbgs() << " Fixing " << G.getEdgeKindName(E.getKind()) << " edge at " - << B->getFixupAddress(E) << " (" << B->getAddress() << " + " - << formatv("{0:x}", E.getOffset()) << ")\n"; - }); - E.setTarget(this->getEntryForTarget(G, E.getTarget())); - return true; - } - } - return false; - } - - /// Create a branch range extension stub for the class's flavor. - Symbol &createEntry(LinkGraph &G, Symbol &Target); - -private: - /// Create a new node in the link-graph for the given stub template. - template - Block &addStub(LinkGraph &G, const uint8_t (&Code)[Size], - uint64_t Alignment) { - ArrayRef Template(reinterpret_cast(Code), Size); - return G.createContentBlock(getStubsSection(G), Template, - orc::ExecutorAddr(), Alignment, 0); - } - - /// Get or create the object file section that will contain all our stubs. - Section &getStubsSection(LinkGraph &G) { - if (!StubsSection) - StubsSection = &G.createSection(getSectionName(), - orc::MemProt::Read | orc::MemProt::Exec); - return *StubsSection; - } - - Section *StubsSection = nullptr; -}; - -/// Create a branch range extension stub with Thumb encoding for v7 CPUs. 
-template <> -Symbol &StubsManager::createEntry(LinkGraph &G, Symbol &Target); - -} // namespace aarch32 -} // namespace jitlink -} // namespace llvm - -#endif // LLVM_EXECUTIONENGINE_JITLINK_AARCH32 diff --git a/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt b/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt index bc86f45d3c185..52ff5e8370031 100644 --- a/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt +++ b/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt @@ -20,7 +20,6 @@ add_llvm_component_library(LLVMJITLink # ELF ELF.cpp ELFLinkGraphBuilder.cpp - ELF_aarch32.cpp ELF_aarch64.cpp ELF_i386.cpp ELF_loongarch.cpp @@ -34,7 +33,6 @@ add_llvm_component_library(LLVMJITLink COFF_x86_64.cpp # Architectures: - aarch32.cpp aarch64.cpp i386.cpp loongarch.cpp diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF.cpp index 340a0ce134475..ef0f19a785712 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF.cpp @@ -13,7 +13,6 @@ #include "llvm/ExecutionEngine/JITLink/ELF.h" #include "llvm/BinaryFormat/ELF.h" -#include "llvm/ExecutionEngine/JITLink/ELF_aarch32.h" #include "llvm/ExecutionEngine/JITLink/ELF_aarch64.h" #include "llvm/ExecutionEngine/JITLink/ELF_i386.h" #include "llvm/ExecutionEngine/JITLink/ELF_loongarch.h" @@ -70,8 +69,6 @@ createLinkGraphFromELFObject(MemoryBufferRef ObjectBuffer) { switch (*TargetMachineArch) { case ELF::EM_AARCH64: return createLinkGraphFromELFObject_aarch64(ObjectBuffer); - case ELF::EM_ARM: - return createLinkGraphFromELFObject_aarch32(ObjectBuffer); case ELF::EM_LOONGARCH: return createLinkGraphFromELFObject_loongarch(ObjectBuffer); case ELF::EM_RISCV: @@ -93,12 +90,6 @@ void link_ELF(std::unique_ptr G, case Triple::aarch64: link_ELF_aarch64(std::move(G), std::move(Ctx)); return; - case Triple::arm: - case Triple::armeb: - case Triple::thumb: - case Triple::thumbeb: - link_ELF_aarch32(std::move(G), std::move(Ctx)); - return; case Triple::loongarch32: case Triple::loongarch64: link_ELF_loongarch(std::move(G), std::move(Ctx)); diff --git a/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h b/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h index 1d98acf868695..9d2d4958dcf6c 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h +++ b/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h @@ -127,12 +127,6 @@ class ELFLinkGraphBuilder : public ELFLinkGraphBuilderBase { Error graphifySections(); Error graphifySymbols(); - /// Override in derived classes to suppress certain sections in the link - /// graph. - virtual bool excludeSection(const typename ELFT::Shdr &Sect) const { - return false; - } - /// Traverse all matching ELFT::Rela relocation records in the given section. /// The handler function Func should be callable with this signature: /// Error(const typename ELFT::Rela &, @@ -327,13 +321,6 @@ template Error ELFLinkGraphBuilder::graphifySections() { auto Name = Obj.getSectionName(Sec, SectionStringTab); if (!Name) return Name.takeError(); - if (excludeSection(Sec)) { - LLVM_DEBUG({ - dbgs() << " " << SecIndex << ": Skipping section \"" << *Name - << "\" explicitly\n"; - }); - continue; - } // Skip null sections. 
if (Sec.sh_type == ELF::SHT_NULL) { @@ -577,10 +564,6 @@ Error ELFLinkGraphBuilder::forEachRelaRelocation( LLVM_DEBUG(dbgs() << " skipped (dwarf section)\n\n"); return Error::success(); } - if (excludeSection(**FixupSection)) { - LLVM_DEBUG(dbgs() << " skipped (fixup section excluded explicitly)\n\n"); - return Error::success(); - } // Lookup the link-graph node corresponding to the target section name. auto *BlockToFix = getGraphBlock(RelSect.sh_info); @@ -627,10 +610,6 @@ Error ELFLinkGraphBuilder::forEachRelRelocation( LLVM_DEBUG(dbgs() << " skipped (dwarf section)\n\n"); return Error::success(); } - if (excludeSection(**FixupSection)) { - LLVM_DEBUG(dbgs() << " skipped (fixup section excluded explicitly)\n\n"); - return Error::success(); - } // Lookup the link-graph node corresponding to the target section name. auto *BlockToFix = getGraphBlock(RelSect.sh_info); diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_aarch32.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_aarch32.cpp deleted file mode 100644 index 0010088fef1e7..0000000000000 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_aarch32.cpp +++ /dev/null @@ -1,299 +0,0 @@ -//===----- ELF_aarch32.cpp - JIT linker implementation for arm/thumb ------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// ELF/aarch32 jit-link implementation. -// -//===----------------------------------------------------------------------===// - -#include "llvm/ExecutionEngine/JITLink/ELF_aarch32.h" - -#include "llvm/BinaryFormat/ELF.h" -#include "llvm/ExecutionEngine/JITLink/JITLink.h" -#include "llvm/ExecutionEngine/JITLink/aarch32.h" -#include "llvm/Object/ELF.h" -#include "llvm/Object/ELFObjectFile.h" -#include "llvm/Support/Endian.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/TargetParser/ARMTargetParser.h" - -#include "ELFLinkGraphBuilder.h" -#include "JITLinkGeneric.h" - -#define DEBUG_TYPE "jitlink" - -using namespace llvm::object; - -namespace llvm { -namespace jitlink { - -/// Translate from ELF relocation type to JITLink-internal edge kind. -Expected getJITLinkEdgeKind(uint32_t ELFType) { - switch (ELFType) { - case ELF::R_ARM_REL32: - return aarch32::Data_Delta32; - case ELF::R_ARM_CALL: - return aarch32::Arm_Call; - case ELF::R_ARM_THM_CALL: - return aarch32::Thumb_Call; - case ELF::R_ARM_THM_JUMP24: - return aarch32::Thumb_Jump24; - case ELF::R_ARM_THM_MOVW_ABS_NC: - return aarch32::Thumb_MovwAbsNC; - case ELF::R_ARM_THM_MOVT_ABS: - return aarch32::Thumb_MovtAbs; - } - - return make_error( - "Unsupported aarch32 relocation " + formatv("{0:d}: ", ELFType) + - object::getELFRelocationTypeName(ELF::EM_ARM, ELFType)); -} - -/// Translate from JITLink-internal edge kind back to ELF relocation type. -Expected getELFRelocationType(Edge::Kind Kind) { - switch (static_cast(Kind)) { - case aarch32::Data_Delta32: - return ELF::R_ARM_REL32; - case aarch32::Arm_Call: - return ELF::R_ARM_CALL; - case aarch32::Thumb_Call: - return ELF::R_ARM_THM_CALL; - case aarch32::Thumb_Jump24: - return ELF::R_ARM_THM_JUMP24; - case aarch32::Thumb_MovwAbsNC: - return ELF::R_ARM_THM_MOVW_ABS_NC; - case aarch32::Thumb_MovtAbs: - return ELF::R_ARM_THM_MOVT_ABS; - } - - return make_error(formatv("Invalid aarch32 edge {0:d}: ", - Kind)); -} - -/// Get a human-readable name for the given ELF AArch32 edge kind. 
-const char *getELFAArch32EdgeKindName(Edge::Kind R) { - // No ELF-specific edge kinds yet - return aarch32::getEdgeKindName(R); -} - -class ELFJITLinker_aarch32 : public JITLinker { - friend class JITLinker; - -public: - ELFJITLinker_aarch32(std::unique_ptr Ctx, - std::unique_ptr G, PassConfiguration PassCfg, - aarch32::ArmConfig ArmCfg) - : JITLinker(std::move(Ctx), std::move(G), std::move(PassCfg)), - ArmCfg(std::move(ArmCfg)) {} - -private: - aarch32::ArmConfig ArmCfg; - - Error applyFixup(LinkGraph &G, Block &B, const Edge &E) const { - return aarch32::applyFixup(G, B, E, ArmCfg); - } -}; - -template -class ELFLinkGraphBuilder_aarch32 - : public ELFLinkGraphBuilder> { -private: - using ELFT = ELFType; - using Base = ELFLinkGraphBuilder; - - bool excludeSection(const typename ELFT::Shdr &Sect) const override { - // TODO: An .ARM.exidx (Exception Index table) entry is 8-bytes in size and - // consists of 2 words. It might be sufficient to process only relocations - // in the the second word (offset 4). Please find more details in: Exception - // Handling ABI for the Arm® Architecture -> Index table entries - if (Sect.sh_type == ELF::SHT_ARM_EXIDX) - return true; - return false; - } - - Error addRelocations() override { - LLVM_DEBUG(dbgs() << "Processing relocations:\n"); - using Self = ELFLinkGraphBuilder_aarch32; - for (const auto &RelSect : Base::Sections) { - if (Error Err = Base::forEachRelRelocation(RelSect, this, - &Self::addSingleRelRelocation)) - return Err; - } - return Error::success(); - } - - Error addSingleRelRelocation(const typename ELFT::Rel &Rel, - const typename ELFT::Shdr &FixupSect, - Block &BlockToFix) { - uint32_t SymbolIndex = Rel.getSymbol(false); - auto ObjSymbol = Base::Obj.getRelocationSymbol(Rel, Base::SymTabSec); - if (!ObjSymbol) - return ObjSymbol.takeError(); - - Symbol *GraphSymbol = Base::getGraphSymbol(SymbolIndex); - if (!GraphSymbol) - return make_error( - formatv("Could not find symbol at given index, did you add it to " - "JITSymbolTable? 
index: {0}, shndx: {1} Size of table: {2}", - SymbolIndex, (*ObjSymbol)->st_shndx, - Base::GraphSymbols.size()), - inconvertibleErrorCode()); - - uint32_t Type = Rel.getType(false); - Expected Kind = getJITLinkEdgeKind(Type); - if (!Kind) - return Kind.takeError(); - - auto FixupAddress = orc::ExecutorAddr(FixupSect.sh_addr) + Rel.r_offset; - Edge::OffsetT Offset = FixupAddress - BlockToFix.getAddress(); - Edge E(*Kind, Offset, *GraphSymbol, 0); - - Expected Addend = - aarch32::readAddend(*Base::G, BlockToFix, E, ArmCfg); - if (!Addend) - return Addend.takeError(); - - E.setAddend(*Addend); - LLVM_DEBUG({ - dbgs() << " "; - printEdge(dbgs(), BlockToFix, E, getELFAArch32EdgeKindName(*Kind)); - dbgs() << "\n"; - }); - - BlockToFix.addEdge(std::move(E)); - return Error::success(); - } - - aarch32::ArmConfig ArmCfg; - -protected: - TargetFlagsType makeTargetFlags(const typename ELFT::Sym &Sym) override { - if (Sym.getValue() & 0x01) - return aarch32::ThumbSymbol; - return TargetFlagsType{}; - } - - orc::ExecutorAddrDiff getRawOffset(const typename ELFT::Sym &Sym, - TargetFlagsType Flags) override { - assert((makeTargetFlags(Sym) & Flags) == Flags); - static constexpr uint64_t ThumbBit = 0x01; - return Sym.getValue() & ~ThumbBit; - } - -public: - ELFLinkGraphBuilder_aarch32(StringRef FileName, const ELFFile &Obj, - Triple TT, aarch32::ArmConfig ArmCfg) - : ELFLinkGraphBuilder(Obj, std::move(TT), FileName, - getELFAArch32EdgeKindName), - ArmCfg(std::move(ArmCfg)) {} -}; - -template -Error buildTables_ELF_aarch32(LinkGraph &G) { - LLVM_DEBUG(dbgs() << "Visiting edges in graph:\n"); - - aarch32::StubsManager PLT; - visitExistingEdges(G, PLT); - return Error::success(); -} - -Expected> -createLinkGraphFromELFObject_aarch32(MemoryBufferRef ObjectBuffer) { - LLVM_DEBUG({ - dbgs() << "Building jitlink graph for new input " - << ObjectBuffer.getBufferIdentifier() << "...\n"; - }); - - auto ELFObj = ObjectFile::createELFObjectFile(ObjectBuffer); - if (!ELFObj) - return ELFObj.takeError(); - - // Find out what exact AArch32 instruction set and features we target. - auto TT = (*ELFObj)->makeTriple(); - ARM::ArchKind AK = ARM::parseArch(TT.getArchName()); - if (AK == ARM::ArchKind::INVALID) - return make_error( - "Failed to build ELF link graph: Invalid ARM ArchKind"); - - // Resolve our internal configuration for the target. If at some point the - // CPUArch alone becomes too unprecise, we can find more details in the - // Tag_CPU_arch_profile. - aarch32::ArmConfig ArmCfg; - using namespace ARMBuildAttrs; - auto Arch = static_cast(ARM::getArchAttr(AK)); - switch (Arch) { - case v7: - case v8_A: - ArmCfg = aarch32::getArmConfigForCPUArch(Arch); - assert(ArmCfg.Stubs != aarch32::Unsupported && - "Provide a config for each supported CPU"); - break; - default: - return make_error( - "Failed to build ELF link graph: Unsupported CPU arch " + - StringRef(aarch32::getCPUArchName(Arch))); - } - - // Populate the link-graph. 
- switch (TT.getArch()) { - case Triple::arm: - case Triple::thumb: { - auto &ELFFile = cast>(**ELFObj).getELFFile(); - return ELFLinkGraphBuilder_aarch32( - (*ELFObj)->getFileName(), ELFFile, TT, ArmCfg) - .buildGraph(); - } - case Triple::armeb: - case Triple::thumbeb: { - auto &ELFFile = cast>(**ELFObj).getELFFile(); - return ELFLinkGraphBuilder_aarch32((*ELFObj)->getFileName(), - ELFFile, TT, ArmCfg) - .buildGraph(); - } - default: - return make_error( - "Failed to build ELF/aarch32 link graph: Invalid target triple " + - TT.getTriple()); - } -} - -void link_ELF_aarch32(std::unique_ptr G, - std::unique_ptr Ctx) { - const Triple &TT = G->getTargetTriple(); - - using namespace ARMBuildAttrs; - ARM::ArchKind AK = ARM::parseArch(TT.getArchName()); - auto CPU = static_cast(ARM::getArchAttr(AK)); - aarch32::ArmConfig ArmCfg = aarch32::getArmConfigForCPUArch(CPU); - - PassConfiguration PassCfg; - if (Ctx->shouldAddDefaultTargetPasses(TT)) { - // Add a mark-live pass. - if (auto MarkLive = Ctx->getMarkLivePass(TT)) - PassCfg.PrePrunePasses.push_back(std::move(MarkLive)); - else - PassCfg.PrePrunePasses.push_back(markAllSymbolsLive); - - switch (ArmCfg.Stubs) { - case aarch32::Thumbv7: - PassCfg.PostPrunePasses.push_back( - buildTables_ELF_aarch32); - break; - case aarch32::Unsupported: - llvm_unreachable("Check before building graph"); - } - } - - if (auto Err = Ctx->modifyPassConfig(*G, PassCfg)) - return Ctx->notifyFailed(std::move(Err)); - - ELFJITLinker_aarch32::link(std::move(Ctx), std::move(G), std::move(PassCfg), - std::move(ArmCfg)); -} - -} // namespace jitlink -} // namespace llvm diff --git a/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp b/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp deleted file mode 100644 index 6f49a4578cf7c..0000000000000 --- a/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp +++ /dev/null @@ -1,514 +0,0 @@ -//===--------- aarch32.cpp - Generic JITLink arm/thumb utilities ----------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Generic utilities for graphs representing arm/thumb objects. -// -//===----------------------------------------------------------------------===// - -#include "llvm/ExecutionEngine/JITLink/aarch32.h" - -#include "llvm/ADT/StringExtras.h" -#include "llvm/BinaryFormat/ELF.h" -#include "llvm/ExecutionEngine/JITLink/JITLink.h" -#include "llvm/Object/ELFObjectFile.h" -#include "llvm/Support/Endian.h" -#include "llvm/Support/MathExtras.h" - -#define DEBUG_TYPE "jitlink" - -namespace llvm { -namespace jitlink { -namespace aarch32 { - -using namespace support; -using namespace support::endian; - -/// Encode 22-bit immediate value for branch instructions without J1J2 range -/// extension (formats B T4, BL T1 and BLX T2). -/// -/// 00000:Imm11H:Imm11L:0 -> [ 00000:Imm11H, 00000:Imm11L ] -/// J1^ ^J2 will always be 1 -/// -HalfWords encodeImmBT4BlT1BlxT2(int64_t Value) { - constexpr uint32_t J1J2 = 0x2800; - uint32_t Imm11H = (Value >> 12) & 0x07ff; - uint32_t Imm11L = (Value >> 1) & 0x07ff; - return HalfWords{Imm11H, Imm11L | J1J2}; -} - -/// Decode 22-bit immediate value for branch instructions without J1J2 range -/// extension (formats B T4, BL T1 and BLX T2). 
-/// -/// [ 00000:Imm11H, 00000:Imm11L ] -> 00000:Imm11H:Imm11L:0 -/// J1^ ^J2 will always be 1 -/// -int64_t decodeImmBT4BlT1BlxT2(uint32_t Hi, uint32_t Lo) { - uint32_t Imm11H = Hi & 0x07ff; - uint32_t Imm11L = Lo & 0x07ff; - return SignExtend64<22>(Imm11H << 12 | Imm11L << 1); -} - -/// Encode 25-bit immediate value for branch instructions with J1J2 range -/// extension (formats B T4, BL T1 and BLX T2). -/// -/// S:I1:I2:Imm10:Imm11:0 -> [ 00000:S:Imm10, 00:J1:0:J2:Imm11 ] -/// -HalfWords encodeImmBT4BlT1BlxT2_J1J2(int64_t Value) { - uint32_t S = (Value >> 14) & 0x0400; - uint32_t J1 = (((~(Value >> 10)) ^ (Value >> 11)) & 0x2000); - uint32_t J2 = (((~(Value >> 11)) ^ (Value >> 13)) & 0x0800); - uint32_t Imm10 = (Value >> 12) & 0x03ff; - uint32_t Imm11 = (Value >> 1) & 0x07ff; - return HalfWords{S | Imm10, J1 | J2 | Imm11}; -} - -/// Decode 25-bit immediate value for branch instructions with J1J2 range -/// extension (formats B T4, BL T1 and BLX T2). -/// -/// [ 00000:S:Imm10, 00:J1:0:J2:Imm11] -> S:I1:I2:Imm10:Imm11:0 -/// -int64_t decodeImmBT4BlT1BlxT2_J1J2(uint32_t Hi, uint32_t Lo) { - uint32_t S = Hi & 0x0400; - uint32_t I1 = ~((Lo ^ (Hi << 3)) << 10) & 0x00800000; - uint32_t I2 = ~((Lo ^ (Hi << 1)) << 11) & 0x00400000; - uint32_t Imm10 = Hi & 0x03ff; - uint32_t Imm11 = Lo & 0x07ff; - return SignExtend64<25>(S << 14 | I1 | I2 | Imm10 << 12 | Imm11 << 1); -} - -/// Encode 16-bit immediate value for move instruction formats MOVT T1 and -/// MOVW T3. -/// -/// Imm4:Imm1:Imm3:Imm8 -> [ 00000:i:000000:Imm4, 0:Imm3:0000:Imm8 ] -/// -HalfWords encodeImmMovtT1MovwT3(uint16_t Value) { - uint32_t Imm4 = (Value >> 12) & 0x0f; - uint32_t Imm1 = (Value >> 11) & 0x01; - uint32_t Imm3 = (Value >> 8) & 0x07; - uint32_t Imm8 = Value & 0xff; - return HalfWords{Imm1 << 10 | Imm4, Imm3 << 12 | Imm8}; -} - -/// Decode 16-bit immediate value from move instruction formats MOVT T1 and -/// MOVW T3. -/// -/// [ 00000:i:000000:Imm4, 0:Imm3:0000:Imm8 ] -> Imm4:Imm1:Imm3:Imm8 -/// -uint16_t decodeImmMovtT1MovwT3(uint32_t Hi, uint32_t Lo) { - uint32_t Imm4 = Hi & 0x0f; - uint32_t Imm1 = (Hi >> 10) & 0x01; - uint32_t Imm3 = (Lo >> 12) & 0x07; - uint32_t Imm8 = Lo & 0xff; - uint32_t Imm16 = Imm4 << 12 | Imm1 << 11 | Imm3 << 8 | Imm8; - assert(Imm16 <= 0xffff && "Decoded value out-of-range"); - return Imm16; -} - -/// Encode register ID for instruction formats MOVT T1 and MOVW T3. -/// -/// Rd4 -> [0000000000000000, 0000:Rd4:00000000] -/// -HalfWords encodeRegMovtT1MovwT3(int64_t Value) { - uint32_t Rd4 = (Value & 0x0f) << 8; - return HalfWords{0, Rd4}; -} - -/// Decode register ID from instruction formats MOVT T1 and MOVW T3. -/// -/// [0000000000000000, 0000:Rd4:00000000] -> Rd4 -/// -int64_t decodeRegMovtT1MovwT3(uint32_t Hi, uint32_t Lo) { - uint32_t Rd4 = (Lo >> 8) & 0x0f; - return Rd4; -} - -/// 32-bit Thumb instructions are stored as two little-endian halfwords. -/// An instruction at address A encodes bytes A+1, A in the first halfword (Hi), -/// followed by bytes A+3, A+2 in the second halfword (Lo). -struct WritableThumbRelocation { - /// Create a writable reference to a Thumb32 fixup. - WritableThumbRelocation(char *FixupPtr) - : Hi{*reinterpret_cast(FixupPtr)}, - Lo{*reinterpret_cast(FixupPtr + 2)} {} - - support::ulittle16_t &Hi; // First halfword - support::ulittle16_t &Lo; // Second halfword -}; - -struct ThumbRelocation { - /// Create a read-only reference to a Thumb32 fixup. 
- ThumbRelocation(const char *FixupPtr) - : Hi{*reinterpret_cast(FixupPtr)}, - Lo{*reinterpret_cast(FixupPtr + 2)} {} - - /// Create a read-only Thumb32 fixup from a writeable one. - ThumbRelocation(WritableThumbRelocation &Writable) - : Hi{Writable.Hi}, Lo(Writable.Lo) {} - - const support::ulittle16_t &Hi; // First halfword - const support::ulittle16_t &Lo; // Second halfword -}; - -Error makeUnexpectedOpcodeError(const LinkGraph &G, const ThumbRelocation &R, - Edge::Kind Kind) { - return make_error( - formatv("Invalid opcode [ 0x{0:x4}, 0x{1:x4} ] for relocation: {2}", R.Hi, - R.Lo, G.getEdgeKindName(Kind))); -} - -template bool checkOpcode(const ThumbRelocation &R) { - uint16_t Hi = R.Hi & FixupInfo::OpcodeMask.Hi; - uint16_t Lo = R.Lo & FixupInfo::OpcodeMask.Lo; - return Hi == FixupInfo::Opcode.Hi && Lo == FixupInfo::Opcode.Lo; -} - -template -bool checkRegister(const ThumbRelocation &R, HalfWords Reg) { - uint16_t Hi = R.Hi & FixupInfo::RegMask.Hi; - uint16_t Lo = R.Lo & FixupInfo::RegMask.Lo; - return Hi == Reg.Hi && Lo == Reg.Lo; -} - -template -bool writeRegister(WritableThumbRelocation &R, HalfWords Reg) { - static constexpr HalfWords Mask = FixupInfo::RegMask; - assert((Mask.Hi & Reg.Hi) == Reg.Hi && (Mask.Hi & Reg.Hi) == Reg.Hi && - "Value bits exceed bit range of given mask"); - R.Hi = (R.Hi & ~Mask.Hi) | Reg.Hi; - R.Lo = (R.Lo & ~Mask.Lo) | Reg.Lo; -} - -template -void writeImmediate(WritableThumbRelocation &R, HalfWords Imm) { - static constexpr HalfWords Mask = FixupInfo::ImmMask; - assert((Mask.Hi & Imm.Hi) == Imm.Hi && (Mask.Hi & Imm.Hi) == Imm.Hi && - "Value bits exceed bit range of given mask"); - R.Hi = (R.Hi & ~Mask.Hi) | Imm.Hi; - R.Lo = (R.Lo & ~Mask.Lo) | Imm.Lo; -} - -Expected readAddendData(LinkGraph &G, Block &B, const Edge &E) { - endianness Endian = G.getEndianness(); - assert(Endian != native && "Declare as little or big explicitly"); - - Edge::Kind Kind = E.getKind(); - const char *BlockWorkingMem = B.getContent().data(); - const char *FixupPtr = BlockWorkingMem + E.getOffset(); - - switch (Kind) { - case Data_Delta32: - return SignExtend64<32>((Endian == little) ? read32(FixupPtr) - : read32(FixupPtr)); - default: - return make_error( - "In graph " + G.getName() + ", section " + B.getSection().getName() + - " can not read implicit addend for aarch32 edge kind " + - G.getEdgeKindName(E.getKind())); - } -} - -Expected readAddendArm(LinkGraph &G, Block &B, const Edge &E) { - Edge::Kind Kind = E.getKind(); - - switch (Kind) { - case Arm_Call: - return make_error( - "Addend extraction for relocation type not yet implemented: " + - StringRef(G.getEdgeKindName(Kind))); - default: - return make_error( - "In graph " + G.getName() + ", section " + B.getSection().getName() + - " can not read implicit addend for aarch32 edge kind " + - G.getEdgeKindName(E.getKind())); - } -} - -Expected readAddendThumb(LinkGraph &G, Block &B, const Edge &E, - const ArmConfig &ArmCfg) { - ThumbRelocation R(B.getContent().data() + E.getOffset()); - Edge::Kind Kind = E.getKind(); - - switch (Kind) { - case Thumb_Call: - if (!checkOpcode(R)) - return makeUnexpectedOpcodeError(G, R, Kind); - return LLVM_LIKELY(ArmCfg.J1J2BranchEncoding) - ? 
decodeImmBT4BlT1BlxT2_J1J2(R.Hi, R.Lo) - : decodeImmBT4BlT1BlxT2(R.Hi, R.Lo); - - case Thumb_Jump24: - if (!checkOpcode(R)) - return makeUnexpectedOpcodeError(G, R, Kind); - if (R.Lo & FixupInfo::LoBitConditional) - return make_error("Relocation expects an unconditional " - "B.W branch instruction: " + - StringRef(G.getEdgeKindName(Kind))); - return LLVM_LIKELY(ArmCfg.J1J2BranchEncoding) - ? decodeImmBT4BlT1BlxT2_J1J2(R.Hi, R.Lo) - : decodeImmBT4BlT1BlxT2(R.Hi, R.Lo); - - case Thumb_MovwAbsNC: - if (!checkOpcode(R)) - return makeUnexpectedOpcodeError(G, R, Kind); - // Initial addend is interpreted as a signed value - return SignExtend64<16>(decodeImmMovtT1MovwT3(R.Hi, R.Lo)); - - case Thumb_MovtAbs: - if (!checkOpcode(R)) - return makeUnexpectedOpcodeError(G, R, Kind); - // Initial addend is interpreted as a signed value - return SignExtend64<16>(decodeImmMovtT1MovwT3(R.Hi, R.Lo)); - - default: - return make_error( - "In graph " + G.getName() + ", section " + B.getSection().getName() + - " can not read implicit addend for aarch32 edge kind " + - G.getEdgeKindName(E.getKind())); - } -} - -Error applyFixupData(LinkGraph &G, Block &B, const Edge &E) { - using namespace support; - - char *BlockWorkingMem = B.getAlreadyMutableContent().data(); - char *FixupPtr = BlockWorkingMem + E.getOffset(); - - auto Write32 = [FixupPtr, Endian = G.getEndianness()](int64_t Value) { - assert(Endian != native && "Must be explicit: little or big"); - assert(isInt<32>(Value) && "Must be in signed 32-bit range"); - uint32_t Imm = static_cast(Value); - if (LLVM_LIKELY(Endian == little)) - endian::write32(FixupPtr, Imm); - else - endian::write32(FixupPtr, Imm); - }; - - Edge::Kind Kind = E.getKind(); - uint64_t FixupAddress = (B.getAddress() + E.getOffset()).getValue(); - int64_t Addend = E.getAddend(); - Symbol &TargetSymbol = E.getTarget(); - uint64_t TargetAddress = TargetSymbol.getAddress().getValue(); - assert(!TargetSymbol.hasTargetFlags(ThumbSymbol)); - - // Regular data relocations have size 4, alignment 1 and write the full 32-bit - // result to the place; no need for overflow checking. 
There are three - // exceptions: R_ARM_ABS8, R_ARM_ABS16, R_ARM_PREL31 - switch (Kind) { - case Data_Delta32: { - int64_t Value = TargetAddress - FixupAddress + Addend; - if (!isInt<32>(Value)) - return makeTargetOutOfRangeError(G, B, E); - Write32(Value); - return Error::success(); - } - default: - return make_error( - "In graph " + G.getName() + ", section " + B.getSection().getName() + - " encountered unfixable aarch32 edge kind " + - G.getEdgeKindName(E.getKind())); - } -} - -Error applyFixupArm(LinkGraph &G, Block &B, const Edge &E) { - Edge::Kind Kind = E.getKind(); - - switch (Kind) { - case Arm_Call: - return make_error( - "Fix-up for relocation type not yet implemented: " + - StringRef(G.getEdgeKindName(Kind))); - default: - return make_error( - "In graph " + G.getName() + ", section " + B.getSection().getName() + - " encountered unfixable aarch32 edge kind " + - G.getEdgeKindName(E.getKind())); - } -} - -Error applyFixupThumb(LinkGraph &G, Block &B, const Edge &E, - const ArmConfig &ArmCfg) { - WritableThumbRelocation R(B.getAlreadyMutableContent().data() + - E.getOffset()); - - Edge::Kind Kind = E.getKind(); - uint64_t FixupAddress = (B.getAddress() + E.getOffset()).getValue(); - int64_t Addend = E.getAddend(); - Symbol &TargetSymbol = E.getTarget(); - uint64_t TargetAddress = TargetSymbol.getAddress().getValue(); - if (TargetSymbol.hasTargetFlags(ThumbSymbol)) - TargetAddress |= 0x01; - - switch (Kind) { - case Thumb_Jump24: { - if (!checkOpcode(R)) - return makeUnexpectedOpcodeError(G, R, Kind); - if (R.Lo & FixupInfo::LoBitConditional) - return make_error("Relocation expects an unconditional " - "B.W branch instruction: " + - StringRef(G.getEdgeKindName(Kind))); - if (!(TargetSymbol.hasTargetFlags(ThumbSymbol))) - return make_error("Branch relocation needs interworking " - "stub when bridging to ARM: " + - StringRef(G.getEdgeKindName(Kind))); - - int64_t Value = TargetAddress - FixupAddress + Addend; - if (LLVM_LIKELY(ArmCfg.J1J2BranchEncoding)) { - if (!isInt<25>(Value)) - return makeTargetOutOfRangeError(G, B, E); - writeImmediate(R, encodeImmBT4BlT1BlxT2_J1J2(Value)); - } else { - if (!isInt<22>(Value)) - return makeTargetOutOfRangeError(G, B, E); - writeImmediate(R, encodeImmBT4BlT1BlxT2(Value)); - } - - return Error::success(); - } - - case Thumb_Call: { - if (!checkOpcode(R)) - return makeUnexpectedOpcodeError(G, R, Kind); - - int64_t Value = TargetAddress - FixupAddress + Addend; - - // The call instruction itself is Thumb. The call destination can either be - // Thumb or Arm. We use BL to stay in Thumb and BLX to change to Arm. - bool TargetIsArm = !TargetSymbol.hasTargetFlags(ThumbSymbol); - bool InstrIsBlx = (R.Lo & FixupInfo::LoBitNoBlx) == 0; - if (TargetIsArm != InstrIsBlx) { - if (LLVM_LIKELY(TargetIsArm)) { - // Change opcode BL -> BLX and fix range value (account for 4-byte - // aligned destination while instruction may only be 2-byte aligned - // and clear Thumb bit). 
- R.Lo = R.Lo & ~FixupInfo::LoBitNoBlx; - R.Lo = R.Lo & ~FixupInfo::LoBitH; - Value = alignTo(Value, 4); - } else { - // Change opcode BLX -> BL and set Thumb bit - R.Lo = R.Lo & ~FixupInfo::LoBitNoBlx; - Value |= 0x01; - } - } - - if (LLVM_LIKELY(ArmCfg.J1J2BranchEncoding)) { - if (!isInt<25>(Value)) - return makeTargetOutOfRangeError(G, B, E); - writeImmediate(R, encodeImmBT4BlT1BlxT2_J1J2(Value)); - } else { - if (!isInt<22>(Value)) - return makeTargetOutOfRangeError(G, B, E); - writeImmediate(R, encodeImmBT4BlT1BlxT2(Value)); - } - - assert(((R.Lo & FixupInfo::LoBitNoBlx) || - (R.Lo & FixupInfo::LoBitH) == 0) && - "Opcode BLX implies H bit is clear (avoid UB in BLX T2)"); - return Error::success(); - } - - case Thumb_MovwAbsNC: { - if (!checkOpcode(R)) - return makeUnexpectedOpcodeError(G, R, Kind); - uint16_t Value = (TargetAddress + Addend) & 0xffff; - writeImmediate(R, encodeImmMovtT1MovwT3(Value)); - return Error::success(); - } - - case Thumb_MovtAbs: { - if (!checkOpcode(R)) - return makeUnexpectedOpcodeError(G, R, Kind); - uint16_t Value = ((TargetAddress + Addend) >> 16) & 0xffff; - writeImmediate(R, encodeImmMovtT1MovwT3(Value)); - return Error::success(); - } - - default: - return make_error( - "In graph " + G.getName() + ", section " + B.getSection().getName() + - " encountered unfixable aarch32 edge kind " + - G.getEdgeKindName(E.getKind())); - } -} - -const uint8_t Thumbv7ABS[] = { - 0x40, 0xf2, 0x00, 0x0c, // movw r12, #0x0000 ; lower 16-bit - 0xc0, 0xf2, 0x00, 0x0c, // movt r12, #0x0000 ; upper 16-bit - 0x60, 0x47 // bx r12 -}; - -template <> -Symbol &StubsManager::createEntry(LinkGraph &G, Symbol &Target) { - constexpr uint64_t Alignment = 4; - Block &B = addStub(G, Thumbv7ABS, Alignment); - LLVM_DEBUG({ - const char *StubPtr = B.getContent().data(); - HalfWords Reg12 = encodeRegMovtT1MovwT3(12); - assert(checkRegister(StubPtr, Reg12) && - checkRegister(StubPtr + 4, Reg12) && - "Linker generated stubs may only corrupt register r12 (IP)"); - }); - B.addEdge(Thumb_MovwAbsNC, 0, Target, 0); - B.addEdge(Thumb_MovtAbs, 4, Target, 0); - Symbol &Stub = G.addAnonymousSymbol(B, 0, B.getSize(), true, false); - Stub.setTargetFlags(ThumbSymbol); - return Stub; -} - -const char *getEdgeKindName(Edge::Kind K) { -#define KIND_NAME_CASE(K) \ - case K: \ - return #K; - - switch (K) { - KIND_NAME_CASE(Data_Delta32) - KIND_NAME_CASE(Arm_Call) - KIND_NAME_CASE(Thumb_Call) - KIND_NAME_CASE(Thumb_Jump24) - KIND_NAME_CASE(Thumb_MovwAbsNC) - KIND_NAME_CASE(Thumb_MovtAbs) - default: - return getGenericEdgeKindName(K); - } -#undef KIND_NAME_CASE -} - -const char *getCPUArchName(ARMBuildAttrs::CPUArch K) { -#define CPUARCH_NAME_CASE(K) \ - case K: \ - return #K; - - using namespace ARMBuildAttrs; - switch (K) { - CPUARCH_NAME_CASE(Pre_v4) - CPUARCH_NAME_CASE(v4) - CPUARCH_NAME_CASE(v4T) - CPUARCH_NAME_CASE(v5T) - CPUARCH_NAME_CASE(v5TE) - CPUARCH_NAME_CASE(v5TEJ) - CPUARCH_NAME_CASE(v6) - CPUARCH_NAME_CASE(v6KZ) - CPUARCH_NAME_CASE(v6T2) - CPUARCH_NAME_CASE(v6K) - CPUARCH_NAME_CASE(v7) - CPUARCH_NAME_CASE(v6_M) - CPUARCH_NAME_CASE(v6S_M) - CPUARCH_NAME_CASE(v7E_M) - CPUARCH_NAME_CASE(v8_A) - CPUARCH_NAME_CASE(v8_R) - CPUARCH_NAME_CASE(v8_M_Base) - CPUARCH_NAME_CASE(v8_M_Main) - CPUARCH_NAME_CASE(v8_1_M_Main) - CPUARCH_NAME_CASE(v9_A) - } - llvm_unreachable("Missing CPUArch in switch?"); -#undef CPUARCH_NAME_CASE -} - -} // namespace aarch32 -} // namespace jitlink -} // namespace llvm diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp 
b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp index 83a09b8d41e91..2c270cd66285d 100644 --- a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp @@ -8,7 +8,6 @@ #include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h" #include "llvm/ExecutionEngine/JITLink/EHFrameSupport.h" -#include "llvm/ExecutionEngine/JITLink/aarch32.h" #include "llvm/ExecutionEngine/Orc/DebugObjectManagerPlugin.h" #include "llvm/ExecutionEngine/Orc/ObjectFileInterface.h" #include "llvm/ExecutionEngine/Orc/Shared/ObjectFormats.h" @@ -41,10 +40,7 @@ bool hasInitializerSection(jitlink::LinkGraph &G) { } JITTargetAddress getJITSymbolPtrForSymbol(Symbol &Sym) { - uint64_t CallableAddr = Sym.getAddress().getValue(); - if (Sym.isCallable() && Sym.hasTargetFlags(aarch32::ThumbSymbol)) - CallableAddr |= 0x01; // thumb bit - return CallableAddr; + return Sym.getAddress().getValue(); } JITSymbolFlags getJITSymbolFlagsForSymbol(Symbol &Sym) { diff --git a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_thumbv7_printf.s b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_thumbv7_printf.s deleted file mode 100644 index 11a77c95cfa8f..0000000000000 --- a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_thumbv7_printf.s +++ /dev/null @@ -1,46 +0,0 @@ -// RUN: llvm-mc -triple=thumbv7-none-linux-gnueabi -arm-add-build-attributes -filetype=obj -o %t.o %s -// RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 -slab-allocate 10Kb -slab-page-size 4096 -abs printf=0x76bbe880 -show-entry-es %t.o | FileCheck %s - -// Check that main is a thumb symbol (with LSB set) and printf is arm (with LSB clear) -// -// CHECK-LABEL: Symbol table: -// CHECK-NEXT: "main": 0x{{[0-9a-f]+[13579bdf]}} [Callable] Ready -// CHECK-NEXT: "printf": 0x76bbe880 [Data] Ready - - .globl main - .p2align 2 - .type main,%function - .code 16 - .thumb_func -main: - .fnstart - .save {r7, lr} - push {r7, lr} - .setfp r7, sp - mov r7, sp - .pad #8 - sub sp, #8 - movs r0, #0 - str r0, [sp] - str r0, [sp, #4] - ldr r0, .LCPI0_0 -.LPC0_0: - add r0, pc - bl printf - ldr r0, [sp] - add sp, #8 - pop {r7, pc} - - .p2align 2 -.LCPI0_0: - .long .L.str-(.LPC0_0+4) - - .size main, .-main - .cantunwind - .fnend - - .type .L.str,%object - .section .rodata.str1.1,"aMS",%progbits,1 -.L.str: - .asciz "Hello AArch32!\n" - .size .L.str, 12 diff --git a/llvm/test/ExecutionEngine/JITLink/AArch32/lit.local.cfg b/llvm/test/ExecutionEngine/JITLink/AArch32/lit.local.cfg deleted file mode 100644 index 20e19aeb06f9d..0000000000000 --- a/llvm/test/ExecutionEngine/JITLink/AArch32/lit.local.cfg +++ /dev/null @@ -1,2 +0,0 @@ -if not 'ARM' in config.root.targets: - config.unsupported = True diff --git a/llvm/unittests/ExecutionEngine/JITLink/AArch32Tests.cpp b/llvm/unittests/ExecutionEngine/JITLink/AArch32Tests.cpp deleted file mode 100644 index 0e41174040b68..0000000000000 --- a/llvm/unittests/ExecutionEngine/JITLink/AArch32Tests.cpp +++ /dev/null @@ -1,200 +0,0 @@ -//===------- AArch32Tests.cpp - Unit tests for the AArch32 backend --------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include -#include - -#include "gtest/gtest.h" - -using namespace llvm; -using namespace llvm::jitlink; -using namespace llvm::jitlink::aarch32; -using namespace llvm::support; -using namespace llvm::support::endian; - -struct MutableHalfWords { - MutableHalfWords(HalfWords Preset) : Hi(Preset.Hi), Lo(Preset.Lo) {} - - void patch(HalfWords Value, HalfWords Mask) { - Hi = (Hi & ~Mask.Hi) | Value.Hi; - Lo = (Lo & ~Mask.Lo) | Value.Lo; - } - - uint16_t Hi; // First halfword - uint16_t Lo; // Second halfword -}; - -namespace llvm { -namespace jitlink { - -Expected getJITLinkEdgeKind(uint32_t ELFType); -Expected getELFRelocationType(Edge::Kind Kind); - -} // namespace jitlink -} // namespace llvm - -TEST(AArch32_ELF, EdgeKinds) { - // Fails: Invalid ELF type -> JITLink kind - Expected ErrKind = getJITLinkEdgeKind(ELF::R_ARM_NONE); - EXPECT_TRUE(errorToBool(ErrKind.takeError())); - - // Fails: Invalid JITLink kind -> ELF type - Expected ErrType = getELFRelocationType(Edge::Invalid); - EXPECT_TRUE(errorToBool(ErrType.takeError())); - - for (Edge::Kind K = FirstDataRelocation; K < LastThumbRelocation; K += 1) { - Expected ELFType = getELFRelocationType(K); - EXPECT_FALSE(errorToBool(ELFType.takeError())) - << "Failed to translate JITLink kind -> ELF type"; - - Expected JITLinkKind = getJITLinkEdgeKind(*ELFType); - EXPECT_FALSE(errorToBool(JITLinkKind.takeError())) - << "Failed to translate ELF type -> JITLink kind"; - - EXPECT_EQ(*JITLinkKind, K) << "Round-trip value inconsistent?"; - } -} - -namespace llvm { -namespace jitlink { -namespace aarch32 { - -HalfWords encodeImmBT4BlT1BlxT2(int64_t Value); -HalfWords encodeImmBT4BlT1BlxT2_J1J2(int64_t Value); -HalfWords encodeImmMovtT1MovwT3(uint16_t Value); -HalfWords encodeRegMovtT1MovwT3(int64_t Value); - -int64_t decodeImmBT4BlT1BlxT2(uint32_t Hi, uint32_t Lo); -int64_t decodeImmBT4BlT1BlxT2_J1J2(uint32_t Hi, uint32_t Lo); -uint16_t decodeImmMovtT1MovwT3(uint32_t Hi, uint32_t Lo); -int64_t decodeRegMovtT1MovwT3(uint32_t Hi, uint32_t Lo); - -} // namespace aarch32 -} // namespace jitlink -} // namespace llvm - -// Big-endian for v7 and v8 (and v6 unless in legacy backwards compatible mode -// be32) have little-endian instructions and big-endian data. In ELF relocatable -// objects big-endian instructions may still be encountered. A be8 supporting -// linker is expected to endian-reverse instructions for the executable. 
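A minimal, self-contained sketch of the 22-bit branch immediate layout documented in the aarch32.cpp encoder comments above (formats B T4, BL T1 and BLX T2 without the J1/J2 range extension). The helper names are hypothetical; the bit manipulation mirrors encodeImmBT4BlT1BlxT2/decodeImmBT4BlT1BlxT2, and the asserts mirror the round-trip checks in the unit tests that follow (bit 0 of the offset is not encoded, so 0x41 decodes back to 0x40).

#include <cassert>
#include <cstdint>

struct HalfWordPair { uint16_t Hi, Lo; };

HalfWordPair encodeImm22(int64_t Value) {
  uint16_t Imm11H = (Value >> 12) & 0x07ff;                 // bits 21..12
  uint16_t Imm11L = (Value >> 1) & 0x07ff;                  // bits 11..1 (bit 0 dropped)
  return {Imm11H, static_cast<uint16_t>(Imm11L | 0x2800)};  // force J1 = J2 = 1
}

int64_t decodeImm22(uint16_t Hi, uint16_t Lo) {
  uint64_t Raw = (uint64_t(Hi & 0x07ff) << 12) | (uint64_t(Lo & 0x07ff) << 1);
  if (Raw & (uint64_t(1) << 21))                            // sign-extend from bit 21
    Raw |= ~uint64_t(0x3fffff);
  return int64_t(Raw);
}

int main() {
  assert(decodeImm22(encodeImm22(0x40).Hi, encodeImm22(0x40).Lo) == 0x40);
  assert(decodeImm22(encodeImm22(0x41).Hi, encodeImm22(0x41).Lo) == 0x40);
  assert(decodeImm22(encodeImm22(-0x40).Hi, encodeImm22(-0x40).Lo) == -0x40);
}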
-template -static HalfWords makeHalfWords(std::array Mem) { - return HalfWords{read16(Mem.data()), read16(Mem.data() + 2)}; -} - -/// 25-bit branch with link (with J1J2 range extension) -TEST(AArch32_Relocations, Thumb_Call_J1J2) { - static_assert(isInt<25>(16777215), "Max value"); - static_assert(isInt<25>(-16777215), "Min value"); - static_assert(!isInt<25>(16777217), "First overflow"); - static_assert(!isInt<25>(-16777217), "First underflow"); - - constexpr HalfWords ImmMask = FixupInfo::ImmMask; - - static std::array MemPresets{ - makeHalfWords({0xff, 0xf7, 0xfe, 0xef}), // common - makeHalfWords({0x00, 0x00, 0x00, 0x00}), // zeros - makeHalfWords({0xff, 0xff, 0xff, 0xff}), // ones - }; - - auto EncodeDecode = [ImmMask](int64_t In, MutableHalfWords &Mem) { - Mem.patch(encodeImmBT4BlT1BlxT2_J1J2(In), ImmMask); - return decodeImmBT4BlT1BlxT2_J1J2(Mem.Hi, Mem.Lo); - }; - - for (MutableHalfWords Mem : MemPresets) { - HalfWords UnaffectedBits(Mem.Hi & ~ImmMask.Hi, Mem.Lo & ~ImmMask.Lo); - - EXPECT_EQ(EncodeDecode(1, Mem), 0); // Zero value - EXPECT_EQ(EncodeDecode(0x41, Mem), 0x40); // Common value - EXPECT_EQ(EncodeDecode(16777215, Mem), 16777214); // Maximum value - EXPECT_EQ(EncodeDecode(-16777215, Mem), -16777216); // Minimum value - EXPECT_NE(EncodeDecode(16777217, Mem), 16777217); // First overflow - EXPECT_NE(EncodeDecode(-16777217, Mem), -16777217); // First underflow - - EXPECT_TRUE(UnaffectedBits.Hi == (Mem.Hi & ~ImmMask.Hi) && - UnaffectedBits.Lo == (Mem.Lo & ~ImmMask.Lo)) - << "Diff outside immediate field"; - } -} - -/// 22-bit branch with link (without J1J2 range extension) -TEST(AArch32_Relocations, Thumb_Call_Bare) { - static_assert(isInt<22>(2097151), "Max value"); - static_assert(isInt<22>(-2097151), "Min value"); - static_assert(!isInt<22>(2097153), "First overflow"); - static_assert(!isInt<22>(-2097153), "First underflow"); - - constexpr HalfWords ImmMask = FixupInfo::ImmMask; - - static std::array MemPresets{ - makeHalfWords({0xff, 0xf7, 0xfe, 0xef}), // common - makeHalfWords({0x00, 0x00, 0x00, 0x00}), // zeros - makeHalfWords({0xff, 0xff, 0xff, 0xff}), // ones - }; - - auto EncodeDecode = [ImmMask](int64_t In, MutableHalfWords &Mem) { - Mem.patch(encodeImmBT4BlT1BlxT2_J1J2(In), ImmMask); - return decodeImmBT4BlT1BlxT2_J1J2(Mem.Hi, Mem.Lo); - }; - - for (MutableHalfWords Mem : MemPresets) { - HalfWords UnaffectedBits(Mem.Hi & ~ImmMask.Hi, Mem.Lo & ~ImmMask.Lo); - - EXPECT_EQ(EncodeDecode(1, Mem), 0); // Zero value - EXPECT_EQ(EncodeDecode(0x41, Mem), 0x40); // Common value - EXPECT_EQ(EncodeDecode(2097151, Mem), 2097150); // Maximum value - EXPECT_EQ(EncodeDecode(-2097151, Mem), -2097152); // Minimum value - EXPECT_NE(EncodeDecode(2097153, Mem), 2097153); // First overflow - EXPECT_NE(EncodeDecode(-2097153, Mem), -2097153); // First underflow - - EXPECT_TRUE(UnaffectedBits.Hi == (Mem.Hi & ~ImmMask.Hi) && - UnaffectedBits.Lo == (Mem.Lo & ~ImmMask.Lo)) - << "Diff outside immediate field"; - } -} - -/// Write immediate value to the top halfword of the destination register -TEST(AArch32_Relocations, Thumb_MovtAbs) { - static_assert(isUInt<16>(65535), "Max value"); - static_assert(!isUInt<16>(65536), "First overflow"); - - constexpr HalfWords ImmMask = FixupInfo::ImmMask; - constexpr HalfWords RegMask = FixupInfo::RegMask; - - static std::array Registers{0, 5, 12}; - static std::array MemPresets{ - makeHalfWords({0xff, 0xf7, 0xfe, 0xef}), // common - makeHalfWords({0x00, 0x00, 0x00, 0x00}), // zeros - makeHalfWords({0xff, 0xff, 0xff, 0xff}), // ones - }; - - auto 
EncodeDecode = [ImmMask](uint32_t In, MutableHalfWords &Mem) { - Mem.patch(encodeImmMovtT1MovwT3(In), ImmMask); - return decodeImmMovtT1MovwT3(Mem.Hi, Mem.Lo); - }; - - for (MutableHalfWords Mem : MemPresets) { - for (uint8_t Reg : Registers) { - HalfWords UnaffectedBits(Mem.Hi & ~(ImmMask.Hi | RegMask.Hi), - Mem.Lo & ~(ImmMask.Lo | RegMask.Lo)); - - Mem.patch(encodeRegMovtT1MovwT3(Reg), RegMask); - EXPECT_EQ(EncodeDecode(0x76bb, Mem), 0x76bb); // Common value - EXPECT_EQ(EncodeDecode(0, Mem), 0); // Minimum value - EXPECT_EQ(EncodeDecode(0xffff, Mem), 0xffff); // Maximum value - EXPECT_NE(EncodeDecode(0x10000, Mem), 0x10000); // First overflow - - // Destination register as well as unaffacted bits should be intact - EXPECT_EQ(decodeRegMovtT1MovwT3(Mem.Hi, Mem.Lo), Reg); - EXPECT_TRUE(UnaffectedBits.Hi == (Mem.Hi & ~(ImmMask.Hi | RegMask.Hi)) && - UnaffectedBits.Lo == (Mem.Lo & ~(ImmMask.Lo | RegMask.Lo))) - << "Diff outside immediate/register field"; - } - } -} diff --git a/llvm/unittests/ExecutionEngine/JITLink/CMakeLists.txt b/llvm/unittests/ExecutionEngine/JITLink/CMakeLists.txt index 978914c748c63..1a71a62d3756d 100644 --- a/llvm/unittests/ExecutionEngine/JITLink/CMakeLists.txt +++ b/llvm/unittests/ExecutionEngine/JITLink/CMakeLists.txt @@ -8,7 +8,6 @@ set(LLVM_LINK_COMPONENTS ) add_llvm_unittest(JITLinkTests - AArch32Tests.cpp EHFrameSupportTests.cpp LinkGraphTests.cpp ) From d557384b43d32700ed09b08564a4f7823061d999 Mon Sep 17 00:00:00 2001 From: Alexander Yermolovich Date: Thu, 23 Mar 2023 13:20:38 -0700 Subject: [PATCH 173/208] [LLDB] Fix for D139955 Summary: Fixing a small typo. Reviewed By: clayborg Differential Revision: https://reviews.llvm.org/D146659 --- .../SymbolFile/DWARF/SymbolFileDWARF.cpp | 2 +- .../DWARF/range-lower-then-low-pc.s | 317 ++++++++++++++++++ 2 files changed, 318 insertions(+), 1 deletion(-) create mode 100644 lldb/test/Shell/SymbolFile/DWARF/range-lower-then-low-pc.s diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index 99a0152eaf6e6..c6873a5b7a09a 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -1319,7 +1319,7 @@ size_t SymbolFileDWARF::ParseBlocksRecursive( range.GetByteSize())); else { GetObjectFile()->GetModule()->ReportError( - "{0x:+8}: adding range [{1:x16}-{2:x16}) which has a base " + "{0:x8}: adding range [{1:x16}-{2:x16}) which has a base " "that is less than the function's low PC {3:x16}. Please file " "a bug and attach the file at the " "start of this error message", diff --git a/lldb/test/Shell/SymbolFile/DWARF/range-lower-then-low-pc.s b/lldb/test/Shell/SymbolFile/DWARF/range-lower-then-low-pc.s new file mode 100644 index 0000000000000..e3cc84db12652 --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/range-lower-then-low-pc.s @@ -0,0 +1,317 @@ +# REQUIRES: x86 + +# RUN: llvm-mc -triple=x86_64-pc-linux -filetype=obj %s > %t +# RUN: lldb-test symbols %t &> %t.txt +# RUN: cat %t.txt | FileCheck %s + +# Tests that error is printed correctly when DW_AT_low_pc value is +# greater then a range entry. + +# CHECK: 0x0000006e: adding range [0x0000000000000000-0x000000000000001f) +# CHECK-SAME: which has a base that is less than the function's low PC 0x0000000000000021. 
+# CHECK-SAME: Please file a bug and attach the file at the start of this error message + + + +# Test was manually modified to change DW_TAG_lexical_block +# to use DW_AT_ranges, and value lower then DW_AT_low_pc value +# in DW_TAG_subprogram +# static int foo(bool b) { +# if (b) { +# int food = 1; +# return food; +# } +# return 0; +# } +# int main() { +# return foo(true); +# } + .text + .file "main.cpp" + .section .text.main,"ax",@progbits + .globl main # -- Begin function main + .p2align 4, 0x90 + .type main,@function +main: # @main +.Lfunc_begin0: + .file 1 "base-lower-then-range-entry" "main.cpp" + .loc 1 8 0 # main.cpp:8:0 + .cfi_startproc +# %bb.0: # %entry + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp + subq $16, %rsp + movl $0, -4(%rbp) +.Ltmp0: + .loc 1 9 10 prologue_end # main.cpp:9:10 + movl $1, %edi + callq _ZL3foob + .loc 1 9 3 epilogue_begin is_stmt 0 # main.cpp:9:3 + addq $16, %rsp + popq %rbp + .cfi_def_cfa %rsp, 8 + retq +.Ltmp1: +.Lfunc_end0: + .size main, .Lfunc_end0-main + .cfi_endproc + # -- End function + .section .text._ZL3foob,"ax",@progbits + .p2align 4, 0x90 # -- Begin function _ZL3foob + .type _ZL3foob,@function +_ZL3foob: # @_ZL3foob +.Lfunc_begin1: + .loc 1 1 0 is_stmt 1 # main.cpp:1:0 + .cfi_startproc +# %bb.0: # %entry + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp + movb %dil, %al + andb $1, %al + movb %al, -5(%rbp) +.Ltmp2: + .loc 1 2 7 prologue_end # main.cpp:2:7 + testb $1, -5(%rbp) + je .LBB1_2 +# %bb.1: # %if.then +.Ltmp3: + .loc 1 3 8 # main.cpp:3:8 + movl $1, -12(%rbp) + .loc 1 4 12 # main.cpp:4:12 + movl -12(%rbp), %eax + .loc 1 4 5 is_stmt 0 # main.cpp:4:5 + movl %eax, -4(%rbp) + jmp .LBB1_3 +.Ltmp4: +.LBB1_2: # %if.end + .loc 1 6 3 is_stmt 1 # main.cpp:6:3 + movl $0, -4(%rbp) +.LBB1_3: # %return + .loc 1 7 1 # main.cpp:7:1 + movl -4(%rbp), %eax + .loc 1 7 1 epilogue_begin is_stmt 0 # main.cpp:7:1 + popq %rbp + .cfi_def_cfa %rsp, 8 + retq +.Ltmp5: +.Lfunc_end1: + .size _ZL3foob, .Lfunc_end1-_ZL3foob + .cfi_endproc + # -- End function + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 14 # DW_FORM_strp + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 14 # DW_FORM_strp + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 85 # DW_AT_ranges + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 0 # DW_CHILDREN_no + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 1 # DW_CHILDREN_yes + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 110 # DW_AT_linkage_name + .byte 14 # DW_FORM_strp 
+ .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 4 # Abbreviation Code + .byte 5 # DW_TAG_formal_parameter + .byte 0 # DW_CHILDREN_no + .byte 2 # DW_AT_location + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 5 # Abbreviation Code + .byte 11 # DW_TAG_lexical_block + .byte 1 # DW_CHILDREN_yes + .byte 85 # DW_AT_ranges <------ Manually modified. Replaced low_pc/high)_pc with rangres. + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 6 # Abbreviation Code + .byte 52 # DW_TAG_variable + .byte 0 # DW_CHILDREN_no + .byte 2 # DW_AT_location + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 7 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 4 # DWARF version number + .long .debug_abbrev # Offset Into Abbrev. Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] 0xb:0x8f DW_TAG_compile_unit + .long .Linfo_string0 # DW_AT_producer + .short 33 # DW_AT_language + .long .Linfo_string1 # DW_AT_name + .long .Lline_table_start0 # DW_AT_stmt_list + .long .Linfo_string2 # DW_AT_comp_dir + .quad 0 # DW_AT_low_pc + .long .Ldebug_ranges0 # DW_AT_ranges + .byte 2 # Abbrev [2] 0x2a:0x19 DW_TAG_subprogram + .quad .Lfunc_begin0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .long .Linfo_string3 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 8 # DW_AT_decl_line + .long 138 # DW_AT_type + # DW_AT_external + .byte 3 # Abbrev [3] 0x43:0x48 DW_TAG_subprogram + .quad .Lfunc_begin1 + 1 # DW_AT_low_pc + .long .Lfunc_end1-.Lfunc_begin1 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .long .Linfo_string5 # DW_AT_linkage_name + .long .Linfo_string6 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 1 # DW_AT_decl_line + .long 138 # DW_AT_type + .byte 4 # Abbrev [4] 0x60:0xe DW_TAG_formal_parameter + .byte 2 # DW_AT_location + .byte 145 + .byte 123 + .long .Linfo_string7 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 1 # DW_AT_decl_line + .long 138 # DW_AT_type + .byte 5 # Abbrev [5] 0x6e:0x1c DW_TAG_lexical_block + .long .Ldebug_ranges0 # DW_AT_ranges <-- Manually modified replaced low_pc/high_pc to rangres. 
+ .byte 6 # Abbrev [6] 0x7b:0xe DW_TAG_variable + .byte 2 # DW_AT_location + .byte 145 + .byte 116 + .long .Linfo_string9 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 3 # DW_AT_decl_line + .long 138 # DW_AT_type + .byte 0 # End Of Children Mark + .byte 0 # End Of Children Mark + .byte 7 # Abbrev [7] 0x8b:0x7 DW_TAG_base_type + .long .Linfo_string4 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 7 # Abbrev [7] 0x92:0x7 DW_TAG_base_type + .long .Linfo_string8 # DW_AT_name + .byte 2 # DW_AT_encoding + .byte 1 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Ldebug_info_end0: + .section .debug_ranges,"",@progbits +.Ldebug_ranges0: + .quad .Lfunc_begin0 + .quad .Lfunc_end0 + .quad .Lfunc_begin1 + .quad .Lfunc_end1 + .quad 0 + .quad 0 + .section .debug_str,"MS",@progbits,1 +.Linfo_string0: + .asciz "clang version 17.0.0 (https://github.com/llvm/llvm-project.git 73027ae39b1492e5b6033358a13b86d7d1e781ae)" # string offset=0 +.Linfo_string1: + .asciz "main.cpp" # string offset=105 +.Linfo_string2: + .asciz "base-lower-then-range-entry" # string offset=114 +.Linfo_string3: + .asciz "main" # string offset=179 +.Linfo_string4: + .asciz "int" # string offset=184 +.Linfo_string5: + .asciz "_ZL3foob" # string offset=188 +.Linfo_string6: + .asciz "foo" # string offset=197 +.Linfo_string7: + .asciz "b" # string offset=201 +.Linfo_string8: + .asciz "bool" # string offset=203 +.Linfo_string9: + .asciz "food" # string offset=208 + .ident "clang version 17.0.0 (https://github.com/llvm/llvm-project.git 73027ae39b1492e5b6033358a13b86d7d1e781ae)" + .section ".note.GNU-stack","",@progbits + .addrsig + .addrsig_sym _ZL3foob + .section .debug_line,"",@progbits +.Lline_table_start0: From 088da8a0e57a461f3be4b554f28c4419418c097c Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Thu, 23 Mar 2023 11:28:49 -0700 Subject: [PATCH 174/208] [lldb][NFC] makeArrayRef -> ArrayRef makeArrayRef is deprecated. --- lldb/source/Commands/CommandOptionsProcessAttach.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/source/Commands/CommandOptionsProcessAttach.cpp b/lldb/source/Commands/CommandOptionsProcessAttach.cpp index f9bd92938fa1c..d3d864dfe0255 100644 --- a/lldb/source/Commands/CommandOptionsProcessAttach.cpp +++ b/lldb/source/Commands/CommandOptionsProcessAttach.cpp @@ -72,5 +72,5 @@ Status CommandOptionsProcessAttach::SetOptionValue( } llvm::ArrayRef CommandOptionsProcessAttach::GetDefinitions() { - return llvm::makeArrayRef(g_process_attach_options); + return llvm::ArrayRef(g_process_attach_options); } From 805f51f9fedf90d2aa0ad46c61cb4c9c0c5bcfe9 Mon Sep 17 00:00:00 2001 From: AdityaK <1894981+hiraditya@users.noreply.github.com> Date: Thu, 23 Mar 2023 13:54:58 -0700 Subject: [PATCH 175/208] Remove Android-mips related tests Split from: https://reviews.llvm.org/D146565, already reviewed there. 
--- llvm/test/CodeGen/Mips/ehframe-indirect.ll | 4 -- llvm/test/CodeGen/Mips/emutls_generic.ll | 75 ---------------------- 2 files changed, 79 deletions(-) delete mode 100644 llvm/test/CodeGen/Mips/emutls_generic.ll diff --git a/llvm/test/CodeGen/Mips/ehframe-indirect.ll b/llvm/test/CodeGen/Mips/ehframe-indirect.ll index 59f358316ddfd..b3f4b48329d7b 100644 --- a/llvm/test/CodeGen/Mips/ehframe-indirect.ll +++ b/llvm/test/CodeGen/Mips/ehframe-indirect.ll @@ -1,13 +1,9 @@ ; RUN: llc -mtriple=mipsel-linux-gnu < %s -asm-verbose -relocation-model=pic | \ ; RUN: FileCheck -check-prefixes=ALL,LINUX,LINUX-O32,O32 %s -; RUN: llc -mtriple=mipsel-linux-android < %s -asm-verbose -relocation-model=pic | \ -; RUN: FileCheck -check-prefixes=ALL,LINUX,LINUX-O32,O32 %s ; RUN: llc -mtriple=mips64el-linux-gnu -target-abi=n32 < %s -asm-verbose -relocation-model=pic | \ ; RUN: FileCheck -check-prefixes=ALL,LINUX,LINUX-NEW,N32 %s ; RUN: llc -mtriple=mips64el-linux-gnu < %s -asm-verbose -relocation-model=pic | \ ; RUN: FileCheck -check-prefixes=ALL,LINUX,LINUX-NEW,N64 %s -; RUN: llc -mtriple=mips64el-linux-android < %s -asm-verbose -relocation-model=pic | \ -; RUN: FileCheck -check-prefixes=ALL,LINUX,LINUX-NEW,N64 %s ; RUN: llc -mtriple=mips64el-linux-gnu < %s -asm-verbose -relocation-model=pic | \ ; RUN: FileCheck -check-prefixes=ALL,LINUX,LINUX-NEW,N64 %s ; RUN: llc -mtriple=mips-unknown-freebsd11.0 < %s -asm-verbose -relocation-model=pic | \ diff --git a/llvm/test/CodeGen/Mips/emutls_generic.ll b/llvm/test/CodeGen/Mips/emutls_generic.ll deleted file mode 100644 index 344a581d6b4b7..0000000000000 --- a/llvm/test/CodeGen/Mips/emutls_generic.ll +++ /dev/null @@ -1,75 +0,0 @@ -; RUN: llc < %s -emulated-tls -mtriple=mipsel-linux-android -relocation-model=pic \ -; RUN: | FileCheck -check-prefix=MIPS_32 %s -; RUN: llc < %s -emulated-tls -mtriple=mips64el-linux-android -relocation-model=pic \ -; RUN: | FileCheck -check-prefix=MIPS_64 %s - -; RUN: llc < %s -mtriple=mipsel-linux-android -relocation-model=pic \ -; RUN: | FileCheck -check-prefix=MIPS_32 %s -; RUN: llc < %s -mtriple=mips64el-linux-android -relocation-model=pic \ -; RUN: | FileCheck -check-prefix=MIPS_64 %s - -; Make sure that TLS symbols are emitted in expected order. 
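The __emutls_v.<name> control variables checked in this test follow a simple four-word layout that can be read directly off the .4byte/.8byte directives in the CHECK lines below (size, alignment, a zero word, and a pointer to the initial-value template __emutls_t.<name>). A hypothetical C++ rendering of that layout, inferred from the test expectations rather than taken from compiler-rt:

#include <cstdint>

// Illustrative only: one control block per emulated-TLS variable, e.g. for
// __emutls_v.internal_y the test expects size = 8, alignment = 16, 0, and the
// address of __emutls_t.internal_y.
struct EmuTlsControlSketch {
  std::uintptr_t Size;      // byte size of the TLS variable
  std::uintptr_t Align;     // required alignment
  std::uintptr_t Reserved;  // emitted as 0 in the object file
  const void *Templ;        // initial-value template, or null for zero-init
};

static_assert(sizeof(EmuTlsControlSketch) == 4 * sizeof(std::uintptr_t),
              "four pointer-sized fields, matching the directives in the test");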
- -@external_x = external thread_local global i32, align 8 -@external_y = thread_local global i8 7, align 2 -@internal_y = internal thread_local global i64 9, align 16 - -define ptr @get_external_x() { -entry: - ret ptr @external_x -} - -define ptr @get_external_y() { -entry: - ret ptr @external_y -} - -define ptr @get_internal_y() { -entry: - ret ptr @internal_y -} - -; MIPS_32-LABEL: get_external_y: -; MIPS_32-LABEL: get_internal_y: -; MIPS_32: lw {{.+}}(__emutls_v.internal_y -; MIPS_32: lw {{.+}}call16(__emutls_get_address -; MIPS_32-NOT: __emutls_t.external_x -; MIPS_32-NOT: __emutls_v.external_x: -; MIPS_32: .data -; MIPS_32: .p2align 2 -; MIPS_32-LABEL: __emutls_v.external_y: -; MIPS_32: .section .rodata, -; MIPS_32-LABEL: __emutls_t.external_y: -; MIPS_32-NEXT: .byte 7 -; MIPS_32: .data -; MIPS_32: .p2align 2 -; MIPS_32-LABEL: __emutls_v.internal_y: -; MIPS_32-NEXT: .4byte 8 -; MIPS_32-NEXT: .4byte 16 -; MIPS_32-NEXT: .4byte 0 -; MIPS_32-NEXT: .4byte __emutls_t.internal_y -; MIPS_32-LABEL: __emutls_t.internal_y: -; MIPS_32-NEXT: .8byte 9 - -; MIPS_64-LABEL: get_external_x: -; MIPS_64-LABEL: get_external_y: -; MIPS_64-LABEL: get_internal_y: -; MIPS_64: ld {{.+}}(__emutls_v.internal_y -; MIPS_64: ld {{.+}}call16(__emutls_get_address -; MIPS_64-NOT: __emutls_t.external_x -; MIPS_64-NOT: __emutls_v.external_x: -; MIPS_64-LABEL: __emutls_v.external_y: -; MIPS_64-NOT: __emutls_v.external_x: -; MIPS_64: .section .rodata, -; MIPS_64-LABEL: __emutls_t.external_y: -; MIPS_64-NEXT: .byte 7 -; MIPS_64: .data -; MIPS_64: .p2align 3 -; MIPS_64-LABEL: __emutls_v.internal_y: -; MIPS_64-NEXT: .8byte 8 -; MIPS_64-NEXT: .8byte 16 -; MIPS_64-NEXT: .8byte 0 -; MIPS_64-NEXT: .8byte __emutls_t.internal_y -; MIPS_64: .section .rodata, -; MIPS_64-LABEL: __emutls_t.internal_y: -; MIPS_64-NEXT: .8byte 9 From 1c9173365a932a0d289ec86704ec645a138de03e Mon Sep 17 00:00:00 2001 From: NagaChaitanya Vellanki Date: Thu, 23 Mar 2023 14:16:25 -0700 Subject: [PATCH 176/208] Fix highlighting issue with _complex and initialization list with more than 2 items Fixes https://github.com/llvm/llvm-project/issues/61518 Reviewed By: aaron.ballman Differential Revision: https://reviews.llvm.org/D146503 --- clang/docs/ReleaseNotes.rst | 2 + clang/lib/Sema/SemaInit.cpp | 2 +- clang/test/Sema/caret-diags-complex-init.cpp | 39 ++++++++++++++++++++ clang/test/Sema/complex-init-list.c | 18 +++++++-- 4 files changed, 56 insertions(+), 5 deletions(-) create mode 100644 clang/test/Sema/caret-diags-complex-init.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 005bf99a62457..faac3b17b223f 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -225,6 +225,8 @@ Bug Fixes in This Version enabling short-circuiting coroutines use cases. This fixes (`#56532 `_) in antecipation of `CWG2563 _`. +- Fix highlighting issue with ``_Complex`` and initialization list with more than + 2 items. (`#61518 `_) Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 17d8b6c98207b..46517c9dde06a 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -1536,7 +1536,7 @@ void InitListChecker::CheckComplexType(const InitializedEntity &Entity, // the element type of the complex type. The first element initializes // the real part, and the second element intitializes the imaginary part. 
- if (IList->getNumInits() != 2) + if (IList->getNumInits() < 2) return CheckScalarType(Entity, IList, DeclType, Index, StructuredList, StructuredIndex); diff --git a/clang/test/Sema/caret-diags-complex-init.cpp b/clang/test/Sema/caret-diags-complex-init.cpp new file mode 100644 index 0000000000000..d8a1b7837a640 --- /dev/null +++ b/clang/test/Sema/caret-diags-complex-init.cpp @@ -0,0 +1,39 @@ +// RUN: not %clang_cc1 -std=c++11 -fsyntax-only -fcaret-diagnostics-max-lines 5 %s 2>&1 | FileCheck %s -strict-whitespace + + +//CHECK: {{.*}}: error: excess elements in scalar initializer +//CHECK-NEXT: {{^}}_Complex double gz1 = {1, 2, 3}; +//CHECK-NEXT: {{^}} ^{{$}} +_Complex double gz1 = {1, 2, 3}; + +//CHECK: {{.*}}: error: excess elements in scalar initializer +//CHECK-NEXT: {{^}}_Complex double dd = {1.0, 2.0, 3.0}; +//CHECK-NEXT: {{^}} ^~~{{$}} +_Complex double dd = {1.0, 2.0, 3.0}; + +//CHECK: {{.*}}: error: excess elements in scalar initializer +//CHECK-NEXT: {{^}}_Complex float fd = {1.0, 2.0, 3.0, 4.0, 5.0}; +//CHECK-NEXT: {{^}} ^~~{{$}} +_Complex float fd = {1.0, 2.0, 3.0, 4.0, 5.0}; + +//CHECK: {{.*}}: error: no viable conversion from 'foo' to 'double' +//CHECK-NEXT: {{^}}_Complex double ds = {f, 1.0, b}; +//CHECK-NEXT: {{^}} ^{{$}} +struct foo{}; +struct bar{}; + +foo f; +bar b; +_Complex double ds = {f, 1.0, b}; + +//CHECK: {{.*}}: error: no viable conversion from 'foo' to 'double' +//CHECK-NEXT: {{^}}_Complex double fg = {1.0, f}; +//CHECK-NEXT: {{^}} ^{{$}} +_Complex double fg = {1.0, f}; + + +//CHECK: {{.*}}: error: excess elements in scalar initializer +//CHECK-NEXT: {{^}}_Complex double gg = {1.0, 2.0, f}; +//CHECK-NEXT: {{^}} ^{{$}} +//CHECK-NEXT: {{^}}6 errors generated. +_Complex double gg = {1.0, 2.0, f}; diff --git a/clang/test/Sema/complex-init-list.c b/clang/test/Sema/complex-init-list.c index bfc6899ac235d..b8f87f57f0793 100644 --- a/clang/test/Sema/complex-init-list.c +++ b/clang/test/Sema/complex-init-list.c @@ -25,17 +25,21 @@ struct teststruct { _Complex float x; }; // Random other valid stuff -_Complex int valid2 = { 1, 2 }; // expected-warning {{complex integer}} expected-warning {{specifying real and imaginary components is an extension}} +_Complex int valid2 = { 1, 2 }; // expected-warning {{complex integer}} \ + // expected-warning {{specifying real and imaginary components is an extension}} struct teststruct valid3 = { { 1.0f, 2.0f} }; // expected-warning {{specifying real and imaginary components is an extension}} _Complex float valid4[2] = { {1.0f, 1.0f}, {1.0f, 1.0f} }; // expected-warning 2 {{specifying real and imaginary components is an extension}} // FIXME: We need some sort of warning for valid5 -_Complex float valid5 = {1.0f, 1.0fi}; // expected-warning {{imaginary constants}} expected-warning {{specifying real and imaginary components is an extension}} +_Complex float valid5 = {1.0f, 1.0fi}; // expected-warning {{imaginary constants}} \ + // expected-warning {{specifying real and imaginary components is an extension}} // Random invalid stuff struct teststruct invalid1 = { 1, 2 }; // expected-warning {{excess elements}} -_Complex float invalid2 = { 1, 2, 3 }; // expected-warning {{excess elements}} -_Complex float invalid3 = {}; // expected-error {{scalar initializer cannot be empty}} expected-warning {{GNU empty initializer}} +_Complex float invalid2 = { 1, 2, 3 }; // expected-warning {{specifying real and imaginary components is an extension}} \ + // expected-warning {{excess elements in scalar initializer}} +_Complex float invalid3 = {}; // 
expected-error {{scalar initializer cannot be empty}} \ + // expected-warning {{GNU empty initializer}} // Check incomplete array sizing @@ -46,3 +50,9 @@ _Complex float sizecheck2[(sizeof(sizetest2) == sizeof(*sizetest2)*3) ? 1 : -1]; // Constant-folding with init list. _Complex float x = 2 + (_Complex float) { 1, 2 }; // expected-warning {{specifying real and imaginary components is an extension}} + +// initialization list +_Complex double cd = {1.0, 2.0, 3.0}; // expected-warning {{specifying real and imaginary components is an extension}} \ + // expected-warning {{excess elements in scalar initializer}} +_Complex float cf = {1.1f, 2.2f, 3.3f, 4.4f}; // expected-warning {{specifying real and imaginary components is an extension}} \ + // expected-warning {{excess elements in scalar initializer}} From 3111784ff7d3d51a9e981b1a0bbc8f6511c34d25 Mon Sep 17 00:00:00 2001 From: Sam Clegg Date: Tue, 6 Dec 2022 16:49:13 -0800 Subject: [PATCH 177/208] [lld][WebAssembly] Initial support for stub libraries See the docs in lld/docs/WebAssembly.rst for more on this. This feature unlocks a lot of simplification in the emscripten toolchain since we can represent the JS libraries to wasm-ld as stub libraries. See https://github.com/emscripten-core/emscripten/issues/18875 Differential Revision: https://reviews.llvm.org/D145308 --- lld/docs/WebAssembly.rst | 33 +++++++++++++ lld/test/wasm/Inputs/libstub-missing-dep.so | 2 + lld/test/wasm/Inputs/libstub-missing-sym.so | 3 ++ lld/test/wasm/Inputs/libstub.so | 5 ++ lld/test/wasm/stub_library.s | 48 ++++++++++++++++++ lld/wasm/Driver.cpp | 55 +++++++++++++++++++++ lld/wasm/InputFiles.cpp | 43 ++++++++++++++++ lld/wasm/InputFiles.h | 13 +++++ lld/wasm/Relocations.cpp | 4 +- lld/wasm/SymbolTable.cpp | 7 +++ lld/wasm/SymbolTable.h | 1 + lld/wasm/Symbols.cpp | 4 ++ lld/wasm/Symbols.h | 7 ++- lld/wasm/Writer.cpp | 4 +- 14 files changed, 224 insertions(+), 5 deletions(-) create mode 100644 lld/test/wasm/Inputs/libstub-missing-dep.so create mode 100644 lld/test/wasm/Inputs/libstub-missing-sym.so create mode 100644 lld/test/wasm/Inputs/libstub.so create mode 100644 lld/test/wasm/stub_library.s diff --git a/lld/docs/WebAssembly.rst b/lld/docs/WebAssembly.rst index c40d4b322080a..dad3177e2c7df 100644 --- a/lld/docs/WebAssembly.rst +++ b/lld/docs/WebAssembly.rst @@ -75,6 +75,11 @@ WebAssembly-specific options: flag which corresponds to ``--unresolve-symbols=ignore`` + ``--import-undefined``. +.. option:: --allow-undefined-file= + + Like ``--allow-undefined``, but the filename specified a flat list of + symbols, one per line, which are allowed to be undefined. + .. option:: --unresolved-symbols= This is a more full featured version of ``--allow-undefined``. @@ -182,11 +187,39 @@ Imports By default no undefined symbols are allowed in the final binary. The flag ``--allow-undefined`` results in a WebAssembly import being defined for each undefined symbol. It is then up to the runtime to provide such symbols. +``--allow-undefined-file`` is the same but allows a list of symbols to be +specified. Alternatively symbols can be marked in the source code as with the ``import_name`` and/or ``import_module`` clang attributes which signals that they are expected to be undefined at static link time. +Stub Libraries +~~~~~~~~~~~~~~ + +Another way to specify imports and exports is via a "stub library". This +feature is inspired by the ELF stub objects which are supported by the Solaris +linker. 
Stub libraries are text files that can be passed as normal linker
+inputs, similar to how linker scripts can be passed to the ELF linker. The stub
+library is a stand-in for a set of symbols that will be available at runtime,
+but doesn't contain any actual code or data. Instead it contains just a list of
+symbols, one per line. Each symbol can specify zero or more dependencies.
+These dependencies are symbols that must be defined, and exported, by the output
+module if the symbol in question is imported/required by the output module.
+
+For example, imagine the runtime provides an external symbol ``foo`` that
+depends on ``malloc`` and ``free``. This can be expressed simply as::
+
+  #STUB
+  foo: malloc,free
+
+Here we are saying that ``foo`` is allowed to be imported (undefined) but that
+if it is imported, then the output module must also export ``malloc`` and
+``free`` to the runtime. If ``foo`` is imported (undefined), but the output
+module does not define ``malloc`` and ``free``, then the link will fail.
+
+Stub libraries must begin with ``#STUB`` on a line by itself.
+
 Garbage Collection
 ~~~~~~~~~~~~~~~~~~
diff --git a/lld/test/wasm/Inputs/libstub-missing-dep.so b/lld/test/wasm/Inputs/libstub-missing-dep.so
new file mode 100644
index 0000000000000..f2345b766f099
--- /dev/null
+++ b/lld/test/wasm/Inputs/libstub-missing-dep.so
@@ -0,0 +1,2 @@
+#STUB
+foo: missing_dep,missing_dep2
diff --git a/lld/test/wasm/Inputs/libstub-missing-sym.so b/lld/test/wasm/Inputs/libstub-missing-sym.so
new file mode 100644
index 0000000000000..2120b948511e9
--- /dev/null
+++ b/lld/test/wasm/Inputs/libstub-missing-sym.so
@@ -0,0 +1,3 @@
+#STUB
+# Symbol `foo` is missing from this file which causes stub_library.s to fail
+bar
diff --git a/lld/test/wasm/Inputs/libstub.so b/lld/test/wasm/Inputs/libstub.so
new file mode 100644
index 0000000000000..57e61f632b101
--- /dev/null
+++ b/lld/test/wasm/Inputs/libstub.so
@@ -0,0 +1,5 @@
+#STUB
+# This is a comment
+foo: foodep1,foodep2
+# This symbol has no dependencies
+bar
diff --git a/lld/test/wasm/stub_library.s b/lld/test/wasm/stub_library.s
new file mode 100644
index 0000000000000..9cbf2505ea9e7
--- /dev/null
+++ b/lld/test/wasm/stub_library.s
@@ -0,0 +1,48 @@
+# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s
+# RUN: wasm-ld %t.o %p/Inputs/libstub.so -o %t.wasm
+# RUN: obj2yaml %t.wasm | FileCheck %s
+
+# When the dependencies are missing the link fails
+# RUN: not wasm-ld %t.o %p/Inputs/libstub-missing-dep.so -o %t.wasm 2>&1 | FileCheck --check-prefix=MISSING-DEP %s
+
+# When the stub library is missing a needed symbol the link fails
+# RUN: not wasm-ld %t.o %p/Inputs/libstub-missing-sym.so -o %t.wasm 2>&1 | FileCheck --check-prefix=MISSING-SYM %s
+
+# MISSING-DEP: libstub-missing-dep.so: undefined symbol: missing_dep. Required by foo
+# MISSING-DEP: libstub-missing-dep.so: undefined symbol: missing_dep2. 
Required by foo + +# MISSING-SYM: undefined symbol: foo + +# The function foo is defined in libstub.so but depend on foodep1 and foodep2 +.functype foo () -> () + +.globl foodep1 +foodep1: + .functype foodep1 () -> () + end_function + +.globl foodep2 +foodep2: + .functype foodep2 () -> () + end_function + +.globl _start +_start: + .functype _start () -> () + call foo + end_function + +# CHECK: - Type: EXPORT +# CHECK-NEXT: Exports: +# CHECK-NEXT: - Name: memory +# CHECK-NEXT: Kind: MEMORY +# CHECK-NEXT: Index: 0 +# CHECK-NEXT: - Name: foodep1 +# CHECK-NEXT: Kind: FUNCTION +# CHECK-NEXT: Index: 1 +# CHECK-NEXT: - Name: foodep2 +# CHECK-NEXT: Kind: FUNCTION +# CHECK-NEXT: Index: 2 +# CHECK-NEXT: - Name: _start +# CHECK-NEXT: Kind: FUNCTION +# CHECK-NEXT: Index: 3 diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp index 310f9df2d5b68..68cd8cabbd7f2 100644 --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -279,6 +279,12 @@ void LinkerDriver::addFile(StringRef path) { case file_magic::wasm_object: files.push_back(createObjectFile(mbref)); break; + case file_magic::unknown: + if (mbref.getBuffer().starts_with("#STUB\n")) { + files.push_back(make(mbref)); + break; + } + [[fallthrough]]; default: error("unknown file type: " + mbref.getBufferIdentifier()); } @@ -868,6 +874,53 @@ static void createOptionalSymbols() { WasmSym::tlsBase = createOptionalGlobal("__tls_base", false); } +static void processStubLibraries() { + log("-- processStubLibraries"); + for (auto &stub_file : symtab->stubFiles) { + LLVM_DEBUG(llvm::dbgs() + << "processing stub file: " << stub_file->getName() << "\n"); + for (auto [name, deps]: stub_file->symbolDependencies) { + auto* sym = symtab->find(name); + if (!sym || !sym->isUndefined() || !sym->isUsedInRegularObj || + sym->forceImport) { + LLVM_DEBUG(llvm::dbgs() << "stub not in needed: " << name << "\n"); + continue; + } + // The first stub library to define a given symbol sets this and + // definitions in later stub libraries are ignored. + sym->forceImport = true; + if (sym->traced) + message(toString(stub_file) + ": importing " + name); + else + LLVM_DEBUG(llvm::dbgs() + << toString(stub_file) << ": importing " << name << "\n"); + for (const auto dep : deps) { + auto* needed = symtab->find(dep); + if (!needed) { + error(toString(stub_file) + ": undefined symbol: " + dep + + ". Required by " + toString(*sym)); + } else if (needed->isUndefined()) { + error(toString(stub_file) + + ": undefined symbol: " + toString(*needed) + + ". Required by " + toString(*sym)); + } else { + LLVM_DEBUG(llvm::dbgs() + << "force export: " << toString(*needed) << "\n"); + needed->forceExport = true; + needed->isUsedInRegularObj = true; + if (auto *lazy = dyn_cast(needed)) { + lazy->fetch(); + if (!config->whyExtract.empty()) + config->whyExtractRecords.emplace_back(stub_file->getName(), + sym->getFile(), *sym); + } + } + } + } + } + log("-- done processStubLibraries"); +} + // Reconstructs command line arguments so that so that you can re-run // the same command with the same inputs. This is for --reproduce. 
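The processStubLibraries pass above applies the rule stated in the documentation: an undefined symbol that a stub library provides becomes a runtime import, and every dependency listed for it must already be defined in the output and is then force-exported. A rough standalone model of that resolution, a sketch only with none of lld's symbol classes or error reporting:

#include <cassert>
#include <map>
#include <string>
#include <vector>

struct SymState { bool Defined = false; bool Imported = false; bool Exported = false; };

bool applyStub(std::map<std::string, SymState> &Symtab,
               const std::map<std::string, std::vector<std::string>> &Stub) {
  for (const auto &[Name, Deps] : Stub) {
    auto It = Symtab.find(Name);
    if (It == Symtab.end() || It->second.Defined)
      continue;                       // stub entry not needed by this link
    It->second.Imported = true;       // undefined symbol becomes an import
    for (const std::string &Dep : Deps) {
      auto DepIt = Symtab.find(Dep);
      if (DepIt == Symtab.end() || !DepIt->second.Defined)
        return false;                 // dependency missing or undefined: error
      DepIt->second.Exported = true;  // dependency is force-exported
    }
  }
  return true;
}

int main() {
  // Mirrors the stub_library.s test: foo comes from the stub library and
  // depends on foodep1/foodep2, which the object file defines.
  std::map<std::string, SymState> Symtab{
      {"foo", {}}, {"foodep1", {true}}, {"foodep2", {true}}};
  assert(applyStub(Symtab, {{"foo", {"foodep1", "foodep2"}}}));
  assert(Symtab["foo"].Imported && Symtab["foodep1"].Exported);
}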
static std::string createResponseFile(const opt::InputArgList &args) { @@ -1166,6 +1219,8 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { if (errorCount()) return; + processStubLibraries(); + createOptionalSymbols(); // Resolve any variant symbols that were created due to signature diff --git a/lld/wasm/InputFiles.cpp b/lld/wasm/InputFiles.cpp index 75760293bbaae..2d9768c768f29 100644 --- a/lld/wasm/InputFiles.cpp +++ b/lld/wasm/InputFiles.cpp @@ -12,6 +12,7 @@ #include "InputElement.h" #include "OutputSegment.h" #include "SymbolTable.h" +#include "lld/Common/Args.h" #include "lld/Common/CommonLinkerContext.h" #include "lld/Common/Reproduce.h" #include "llvm/Object/Binary.h" @@ -678,6 +679,48 @@ Symbol *ObjFile::createUndefined(const WasmSymbol &sym, bool isCalledDirectly) { llvm_unreachable("unknown symbol kind"); } + +StringRef strip(StringRef s) { + while (s.starts_with(" ")) { + s = s.drop_front(); + } + while (s.ends_with(" ")) { + s = s.drop_back(); + } + return s; +} + +void StubFile::parse() { + bool first = false; + + for (StringRef line : args::getLines(mb)) { + // File must begin with #STUB + if (first) { + assert(line == "#STUB\n"); + first = false; + } + + // Lines starting with # are considered comments + if (line.startswith("#")) + continue; + + StringRef sym; + StringRef rest; + std::tie(sym, rest) = line.split(':'); + sym = strip(sym); + rest = strip(rest); + + symbolDependencies[sym] = {}; + + while (rest.size()) { + StringRef first; + std::tie(first, rest) = rest.split(','); + first = strip(first); + symbolDependencies[sym].push_back(first); + } + } +} + void ArchiveFile::parse() { // Parse a MemoryBufferRef as an archive file. LLVM_DEBUG(dbgs() << "Parsing library: " << toString(this) << "\n"); diff --git a/lld/wasm/InputFiles.h b/lld/wasm/InputFiles.h index c72f64cb2bd04..11cee5405b657 100644 --- a/lld/wasm/InputFiles.h +++ b/lld/wasm/InputFiles.h @@ -47,6 +47,7 @@ class InputFile { SharedKind, ArchiveKind, BitcodeKind, + StubKind, }; virtual ~InputFile() {} @@ -183,6 +184,18 @@ class BitcodeFile : public InputFile { static bool doneLTO; }; +// Stub libray (See docs/WebAssembly.rst) +class StubFile : public InputFile { +public: + explicit StubFile(MemoryBufferRef m) : InputFile(StubKind, m) {} + + static bool classof(const InputFile *f) { return f->kind() == StubKind; } + + void parse(); + + llvm::DenseMap> symbolDependencies; +}; + inline bool isBitcode(MemoryBufferRef mb) { return identify_magic(mb.getBuffer()) == llvm::file_magic::bitcode; } diff --git a/lld/wasm/Relocations.cpp b/lld/wasm/Relocations.cpp index 2f6dd6af2d030..ce41cdcb3e07f 100644 --- a/lld/wasm/Relocations.cpp +++ b/lld/wasm/Relocations.cpp @@ -32,9 +32,9 @@ static bool requiresGOTAccess(const Symbol *sym) { } static bool allowUndefined(const Symbol* sym) { - // Symbols with explicit import names are always allowed to be undefined at + // Symbols that are explicitly imported are always allowed to be undefined at // link time. 
- if (sym->importName) + if (sym->isImported()) return true; if (isa(sym) && config->importUndefined) return true; diff --git a/lld/wasm/SymbolTable.cpp b/lld/wasm/SymbolTable.cpp index 881b1231ffdf9..d33176a0fa54a 100644 --- a/lld/wasm/SymbolTable.cpp +++ b/lld/wasm/SymbolTable.cpp @@ -38,6 +38,13 @@ void SymbolTable::addFile(InputFile *file) { return; } + // stub file + if (auto *f = dyn_cast(file)) { + f->parse(); + stubFiles.push_back(f); + return; + } + if (config->trace) message(toString(file)); diff --git a/lld/wasm/SymbolTable.h b/lld/wasm/SymbolTable.h index 5009e6039602b..ef2a023b68c44 100644 --- a/lld/wasm/SymbolTable.h +++ b/lld/wasm/SymbolTable.h @@ -102,6 +102,7 @@ class SymbolTable { DefinedFunction *createUndefinedStub(const WasmSignature &sig); std::vector objectFiles; + std::vector stubFiles; std::vector sharedFiles; std::vector bitcodeFiles; std::vector syntheticFunctions; diff --git a/lld/wasm/Symbols.cpp b/lld/wasm/Symbols.cpp index 8864e840dd585..567ff49dfa444 100644 --- a/lld/wasm/Symbols.cpp +++ b/lld/wasm/Symbols.cpp @@ -221,6 +221,10 @@ void Symbol::setHidden(bool isHidden) { flags |= WASM_SYMBOL_VISIBILITY_DEFAULT; } +bool Symbol::isImported() const { + return isUndefined() && (importName.has_value() || forceImport); +} + bool Symbol::isExported() const { if (!isDefined() || isLocal()) return false; diff --git a/lld/wasm/Symbols.h b/lld/wasm/Symbols.h index 16f1b535876e0..34fff4b962bdc 100644 --- a/lld/wasm/Symbols.h +++ b/lld/wasm/Symbols.h @@ -114,6 +114,7 @@ class Symbol { void setOutputSymbolIndex(uint32_t index); WasmSymbolType getWasmType() const; + bool isImported() const; bool isExported() const; bool isExportedExplicit() const; @@ -135,7 +136,8 @@ class Symbol { Symbol(StringRef name, Kind k, uint32_t flags, InputFile *f) : name(name), file(f), symbolKind(k), referenced(!config->gcSections), requiresGOT(false), isUsedInRegularObj(false), forceExport(false), - canInline(false), traced(false), isStub(false), flags(flags) {} + forceImport(false), canInline(false), traced(false), isStub(false), + flags(flags) {} StringRef name; InputFile *file; @@ -160,6 +162,8 @@ class Symbol { // -e/--export command line flag) bool forceExport : 1; + bool forceImport : 1; + // False if LTO shouldn't inline whatever this symbol points to. If a symbol // is overwritten after LTO, LTO shouldn't inline the symbol because it // doesn't know the final contents of the symbol. @@ -661,6 +665,7 @@ T *replaceSymbol(Symbol *s, ArgT &&... arg) { T *s2 = new (s) T(std::forward(arg)...); s2->isUsedInRegularObj = symCopy.isUsedInRegularObj; s2->forceExport = symCopy.forceExport; + s2->forceImport = symCopy.forceImport; s2->canInline = symCopy.canInline; s2->traced = symCopy.traced; s2->referenced = symCopy.referenced; diff --git a/lld/wasm/Writer.cpp b/lld/wasm/Writer.cpp index 030ef7468791a..d9e87276b31b0 100644 --- a/lld/wasm/Writer.cpp +++ b/lld/wasm/Writer.cpp @@ -744,7 +744,7 @@ static bool shouldImport(Symbol *sym) { if (config->allowUndefinedSymbols.count(sym->getName()) != 0) return true; - return sym->importName.has_value(); + return sym->isImported(); } void Writer::calculateImports() { @@ -1709,7 +1709,7 @@ void Writer::run() { sym->forceExport = true; } - // Delay reporting error about explicit exports until after + // Delay reporting errors about explicit exports until after // addStartStopSymbols which can create optional symbols. 
for (auto &name : config->requiredExports) { Symbol *sym = symtab->find(name); From af54d1e852850edcc7b9485851320d9ebf1be4fe Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 23 Mar 2023 14:15:01 -0500 Subject: [PATCH 178/208] [NVPTX] Set the atomic inling threshold when targeting NVPTX directly Since Clang 16.0.0 users can target the `NVPTX` architecture directly via `--target=nvptx64-nvidia-cuda`. However, this does not set the atomic inlining size correctly. This leads to spurious warnings and emission of runtime atomics that are never implemented. This patch ensures that we set this to the appropriate pointer width. This will always be 64 in the future as `nvptx64` will only be supported moving forward. Fixes: https://github.com/llvm/llvm-project/issues/61410 Reviewed By: tra Differential Revision: https://reviews.llvm.org/D146750 --- clang/lib/Basic/Targets/NVPTX.cpp | 2 ++ clang/test/CodeGen/atomics-inlining.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index bacd93ee1c379..aca51b2b95b59 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -93,6 +93,8 @@ NVPTXTargetInfo::NVPTXTargetInfo(const llvm::Triple &Triple, default: llvm_unreachable("TargetPointerWidth must be 32 or 64"); } + + MaxAtomicInlineWidth = TargetPointerWidth; return; } diff --git a/clang/test/CodeGen/atomics-inlining.c b/clang/test/CodeGen/atomics-inlining.c index ade0e3d75bcb0..862c63076b2dc 100644 --- a/clang/test/CodeGen/atomics-inlining.c +++ b/clang/test/CodeGen/atomics-inlining.c @@ -8,6 +8,7 @@ // RUN: %clang_cc1 -triple mipsisa64r6el-linux-gnuabi64 -emit-llvm %s -o - | FileCheck %s -check-prefix=MIPS64 // RUN: %clang_cc1 -triple sparc-unknown-eabi -emit-llvm %s -o - | FileCheck %s -check-prefix=SPARCV8 -check-prefix=SPARC // RUN: %clang_cc1 -triple sparcv9-unknown-eabi -emit-llvm %s -o - | FileCheck %s -check-prefix=SPARCV9 -check-prefix=SPARC +// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -emit-llvm %s -o - | FileCheck %s -check-prefix=NVPTX unsigned char c1, c2; unsigned short s1, s2; @@ -109,4 +110,17 @@ void test1(void) { // SPARCV9: store atomic i64 {{.*}}, ptr @ll1 seq_cst, align 8 // SPARCV8: call void @__atomic_load(i32 noundef 100, ptr noundef @a1, ptr noundef @a2 // SPARCV8: call void @__atomic_store(i32 noundef 100, ptr noundef @a1, ptr noundef @a2 + +// NVPTX-LABEL: define{{.*}} void @test1 +// NVPTX: = load atomic i8, ptr @c1 seq_cst, align 1 +// NVPTX: store atomic i8 {{.*}}, ptr @c1 seq_cst, align 1 +// NVPTX: = load atomic i16, ptr @s1 seq_cst, align 2 +// NVPTX: store atomic i16 {{.*}}, ptr @s1 seq_cst, align 2 +// NVPTX: = load atomic i32, ptr @i1 seq_cst, align 4 +// NVPTX: store atomic i32 {{.*}}, ptr @i1 seq_cst, align 4 +// NVPTX: = load atomic i64, ptr @ll1 seq_cst, align 8 +// NVPTX: store atomic i64 {{.*}}, ptr @ll1 seq_cst, align 8 +// NVPTX: call void @__atomic_load(i64 noundef 100, ptr noundef @a1, ptr noundef @a2, i32 noundef 5) +// NVPTX: call void @__atomic_store(i64 noundef 100, ptr noundef @a1, ptr noundef @a2, i32 noundef 5) + } From d11e49f0c8c52d715cd0f7dea436ac5e0dce9c42 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 23 Mar 2023 16:30:31 -0500 Subject: [PATCH 179/208] [libc][NFC] Fix misspelled variable name in cmake message --- libc/cmake/modules/prepare_libc_gpu_build.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libc/cmake/modules/prepare_libc_gpu_build.cmake 
b/libc/cmake/modules/prepare_libc_gpu_build.cmake index 1f9b68afd95cd..fe0f4ef4826d5 100644 --- a/libc/cmake/modules/prepare_libc_gpu_build.cmake +++ b/libc/cmake/modules/prepare_libc_gpu_build.cmake @@ -45,7 +45,7 @@ endif() set(LIBC_GPU_TEST_ARCHITECTURE "" CACHE STRING "Architecture for the GPU tests") if(LIBC_GPU_TEST_ARCHITECTURE) message(STATUS "Using user-specified GPU architecture for testing " - "'${LIBC_GPU_TARGET_ARCHITECTURE}'") + "'${LIBC_GPU_TEST_ARCHITECTURE}'") if("${LIBC_GPU_TEST_ARCHITECTURE}" IN_LIST all_amdgpu_architectures) set(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU TRUE) set(LIBC_GPU_TARGET_TRIPLE "amdgcn-amd-amdhsa") From 9ddc03a17dba1d7aaad73067325344f6b79441b0 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 23 Mar 2023 16:41:25 -0500 Subject: [PATCH 180/208] [OpenMP] Fix test after updating NVPTX atomic inlines Summary: The previous patch fixed how we handle emitting atomics for targeting NVPTX directly. This is the only other file that really does that and has atomics and I forgot to update it. --- .../OpenMP/nvptx_nested_parallel_codegen.cpp | 36 +++++-------------- 1 file changed, 8 insertions(+), 28 deletions(-) diff --git a/clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp index c5c31c601ed39..010cbae25b9af 100644 --- a/clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp +++ b/clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp @@ -45,7 +45,7 @@ int main() { // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: // CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK1-NEXT: call void @_Z3usePi(ptr noundef [[TMP0]]) #[[ATTR7:[0-9]+]] +// CHECK1-NEXT: call void @_Z3usePi(ptr noundef [[TMP0]]) #[[ATTR6:[0-9]+]] // CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 // CHECK1-NEXT: store ptr [[TMP0]], ptr [[TMP3]], align 8 // CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 2, i32 -1, ptr @__omp_outlined__, ptr @__omp_outlined___wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1) @@ -78,7 +78,7 @@ int main() { // CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 // CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @_Z3usePi(ptr noundef [[TMP0]]) #[[ATTR7]] +// CHECK1-NEXT: call void @_Z3usePi(ptr noundef [[TMP0]]) #[[ATTR6]] // CHECK1-NEXT: ret void // // @@ -111,7 +111,7 @@ int main() { // CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK1-NEXT: call void @_Z4workPi(ptr noundef [[TMP1]]) #[[ATTR7]] +// CHECK1-NEXT: call void @_Z4workPi(ptr noundef [[TMP1]]) #[[ATTR6]] // CHECK1-NEXT: ret void // // @@ -119,19 +119,9 @@ int main() { // CHECK1-SAME: (ptr noundef [[C:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 -// CHECK1-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca i32, align 4 // CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8 -// CHECK1-NEXT: call void @__atomic_load(i64 noundef 4, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 0) #[[ATTR7]] -// CHECK1-NEXT: br label 
[[ATOMIC_CONT:%.*]] -// CHECK1: atomic_cont: -// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], 1 -// CHECK1-NEXT: store i32 [[ADD]], ptr [[ATOMIC_TEMP1]], align 4 -// CHECK1-NEXT: [[CALL:%.*]] = call noundef zeroext i1 @__atomic_compare_exchange(i64 noundef 4, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], ptr noundef [[ATOMIC_TEMP1]], i32 noundef 0, i32 noundef 0) #[[ATTR7]] -// CHECK1-NEXT: br i1 [[CALL]], label [[ATOMIC_EXIT:%.*]], label [[ATOMIC_CONT]] -// CHECK1: atomic_exit: +// CHECK1-NEXT: [[TMP1:%.*]] = atomicrmw add ptr [[TMP0]], i32 1 monotonic, align 4 // CHECK1-NEXT: ret void // // @@ -165,7 +155,7 @@ int main() { // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: // CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK2-NEXT: call void @_Z3usePi(ptr noundef [[TMP0]]) #[[ATTR7:[0-9]+]] +// CHECK2-NEXT: call void @_Z3usePi(ptr noundef [[TMP0]]) #[[ATTR6:[0-9]+]] // CHECK2-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 // CHECK2-NEXT: store ptr [[TMP0]], ptr [[TMP3]], align 4 // CHECK2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 2, i32 -1, ptr @__omp_outlined__, ptr @__omp_outlined___wrapper, ptr [[CAPTURED_VARS_ADDRS]], i32 1) @@ -198,7 +188,7 @@ int main() { // CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 // CHECK2-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 4 // CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 4 -// CHECK2-NEXT: call void @_Z3usePi(ptr noundef [[TMP0]]) #[[ATTR7]] +// CHECK2-NEXT: call void @_Z3usePi(ptr noundef [[TMP0]]) #[[ATTR6]] // CHECK2-NEXT: ret void // // @@ -231,7 +221,7 @@ int main() { // CHECK2-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 4 // CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 4 // CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 -// CHECK2-NEXT: call void @_Z4workPi(ptr noundef [[TMP1]]) #[[ATTR7]] +// CHECK2-NEXT: call void @_Z4workPi(ptr noundef [[TMP1]]) #[[ATTR6]] // CHECK2-NEXT: ret void // // @@ -239,19 +229,9 @@ int main() { // CHECK2-SAME: (ptr noundef [[C:%.*]]) #[[ATTR1]] { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 4 -// CHECK2-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca i32, align 4 // CHECK2-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 4 // CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 4 -// CHECK2-NEXT: call void @__atomic_load(i32 noundef 4, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 0) #[[ATTR7]] -// CHECK2-NEXT: br label [[ATOMIC_CONT:%.*]] -// CHECK2: atomic_cont: -// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], 1 -// CHECK2-NEXT: store i32 [[ADD]], ptr [[ATOMIC_TEMP1]], align 4 -// CHECK2-NEXT: [[CALL:%.*]] = call noundef zeroext i1 @__atomic_compare_exchange(i32 noundef 4, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], ptr noundef [[ATOMIC_TEMP1]], i32 noundef 0, i32 noundef 0) #[[ATTR7]] -// CHECK2-NEXT: br i1 [[CALL]], label [[ATOMIC_EXIT:%.*]], label [[ATOMIC_CONT]] -// CHECK2: atomic_exit: +// CHECK2-NEXT: [[TMP1:%.*]] = atomicrmw add ptr [[TMP0]], i32 1 monotonic, align 4 // CHECK2-NEXT: ret void // // From 53a917595186d711026505dbc42b95aca5a67825 Mon Sep 17 00:00:00 2001 From: 
Leonard Chan Date: Thu, 23 Mar 2023 21:44:59 +0000 Subject: [PATCH 181/208] [llvm] Handle duplicate call bases when applying branch funneling It's possible to segfault in `DevirtModule::applyICallBranchFunnel` when attempting to call `getCaller` on a call base that was erased in a prior iteration. This can occur when attempting to find devirtualizable calls via `findDevirtualizableCallsForTypeTest` if the vtable passed to llvm.type.test is a global and not a local. The function works by taking the first argument of the llvm.type.test call (which is a vtable), iterating through all uses of it, and adding any relevant all uses that are calls associated with that intrinsic call to a vector. For most cases where the vtable is actually a *local*, this wouldn't be an issue. Take for example: ``` define i32 @fn(ptr %obj) #0 { %vtable = load ptr, ptr %obj %p = call i1 @llvm.type.test(ptr %vtable, metadata !"typeid2") call void @llvm.assume(i1 %p) %fptr = load ptr, ptr %vtable %result = call i32 %fptr(ptr %obj, i32 1) ret i32 %result } ``` `findDevirtualizableCallsForTypeTest` will check the call base ` %result = call i32 %fptr(ptr %obj, i32 1)`, find that it is associated with a virtualizable call from `%vtable`, find all loads for `%vtable`, and add any instances those load results are called into a vector. Now consider the case where instead `%vtable` was the global itself rather than a local: ``` define i32 @fn(ptr %obj) #0 { %p = call i1 @llvm.type.test(ptr @vtable, metadata !"typeid2") call void @llvm.assume(i1 %p) %fptr = load ptr, ptr @vtable %result = call i32 %fptr(ptr %obj, i32 1) ret i32 %result } ``` `findDevirtualizableCallsForTypeTest` should work normally and add one unique call instance to a vector. However, if there are multiple instances where this same global is used for llvm.type.test, like with: ``` define i32 @fn(ptr %obj) #0 { %p = call i1 @llvm.type.test(ptr @vtable, metadata !"typeid2") call void @llvm.assume(i1 %p) %fptr = load ptr, ptr @vtable %result = call i32 %fptr(ptr %obj, i32 1) ret i32 %result } define i32 @fn2(ptr %obj) #0 { %p = call i1 @llvm.type.test(ptr @vtable, metadata !"typeid2") call void @llvm.assume(i1 %p) %fptr = load ptr, ptr @vtable %result = call i32 %fptr(ptr %obj, i32 1) ret i32 %result } ``` Then each call base `%result = call i32 %fptr(ptr %obj, i32 1)` will be added to the vector twice. This is because for either call base `%result = call i32 %fptr(ptr %obj, i32 1) `, we determine it is associated with a virtualizable call from `@vtable`, and then we iterate through all the uses of `@vtable`, which is used across multiple functions. So when scanning the first `%result = call i32 %fptr(ptr %obj, i32 1)`, then both call bases will be added to the vector, but when scanning the second one, both call bases are added again, resulting in duplicate call bases in the CSInfo.CallSites vector. Note this is actually accounted for in every other instance WPD iterates over CallSites. What everything else does is actually add the call base to the `OptimizedCalls` set and just check if it's already in the set. We can't reuse that particular set since it serves a different purpose marking which calls where devirtualized which `applyICallBranchFunnel` explicitly says it doesn't. For this fix, we can just account for duplicates with a map and do the actual replacements afterwards by iterating over the map. 
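The shape of that fix, reduced to a self-contained sketch (the `Node` type and
the worklist here are illustrative stand-ins, not the actual `CallBase`/call-site
structures used by WPD):

```cpp
#include <cstdio>
#include <map>
#include <vector>

// Illustrative stand-in for an instruction that a worklist may mention twice.
struct Node {
  int value;
};

int main() {
  Node a{1}, b{2};
  // The worklist can reference the same node twice, mirroring the duplicate
  // call sites described above.
  std::vector<Node *> worklist = {&a, &b, &a};

  // Pass 1: record the intended replacement for each node once; a second
  // visit of the same node is simply ignored instead of re-processed.
  std::map<Node *, int> replacements;
  for (Node *n : worklist) {
    if (replacements.find(n) != replacements.end())
      continue; // already scheduled for replacement
    replacements[n] = n->value * 10;
  }

  // Pass 2: apply (and, in the real pass, erase) only after the iteration is
  // finished, so no already-replaced node is ever touched again.
  for (auto &[n, repl] : replacements)
    n->value = repl;

  std::printf("%d %d\n", a.value, b.value); // prints: 10 20
}
```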
Differential Revision: https://reviews.llvm.org/D146267 --- .../lib/Transforms/IPO/WholeProgramDevirt.cpp | 19 +++++++- .../WholeProgramDevirt/branch-funnel.ll | 48 +++++++++++++++++++ 2 files changed, 65 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index e380b47c735fe..8224de30d6986 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -1391,9 +1391,20 @@ void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo, IsExported = true; if (CSInfo.AllCallSitesDevirted) return; + + std::map CallBases; for (auto &&VCallSite : CSInfo.CallSites) { CallBase &CB = VCallSite.CB; + if (CallBases.find(&CB) != CallBases.end()) { + // When finding devirtualizable calls, it's possible to find the same + // vtable passed to multiple llvm.type.test or llvm.type.checked.load + // calls, which can cause duplicate call sites to be recorded in + // [Const]CallSites. If we've already found one of these + // call instances, just ignore it. It will be replaced later. + continue; + } + // Jump tables are only profitable if the retpoline mitigation is enabled. Attribute FSAttr = CB.getCaller()->getFnAttribute("target-features"); if (!FSAttr.isValid() || @@ -1440,8 +1451,7 @@ void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo, AttributeList::get(M.getContext(), Attrs.getFnAttrs(), Attrs.getRetAttrs(), NewArgAttrs)); - CB.replaceAllUsesWith(NewCS); - CB.eraseFromParent(); + CallBases[&CB] = NewCS; // This use is no longer unsafe. if (VCallSite.NumUnsafeUses) @@ -1451,6 +1461,11 @@ void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo, // retpoline mitigation, which would mean that they are lowered to // llvm.type.test and therefore require an llvm.type.test resolution for the // type identifier. + + std::for_each(CallBases.begin(), CallBases.end(), [](auto &CBs) { + CBs.first->replaceAllUsesWith(CBs.second); + CBs.first->eraseFromParent(); + }); }; Apply(SlotInfo.CSInfo); for (auto &P : SlotInfo.ConstCSInfo) diff --git a/llvm/test/Transforms/WholeProgramDevirt/branch-funnel.ll b/llvm/test/Transforms/WholeProgramDevirt/branch-funnel.ll index 4a6e3634a5d16..0b1023eee2732 100644 --- a/llvm/test/Transforms/WholeProgramDevirt/branch-funnel.ll +++ b/llvm/test/Transforms/WholeProgramDevirt/branch-funnel.ll @@ -233,6 +233,54 @@ define i32 @fn3_rv(ptr %obj) #0 { ret i32 %result } +; CHECK-LABEL: define i32 @fn4 +; CHECK-NOT: call void (...) @llvm.icall.branch.funnel +define i32 @fn4(ptr %obj) #0 { + %p = call i1 @llvm.type.test(ptr @vt1_1, metadata !"typeid1") + call void @llvm.assume(i1 %p) + %fptr = load ptr, ptr @vt1_1 + ; RETP: call i32 @__typeid_typeid1_0_branch_funnel(ptr nest @vt1_1, ptr %obj, i32 1) + %result = call i32 %fptr(ptr %obj, i32 1) + ; NORETP: call i32 % + ret i32 %result +} + +; CHECK-LABEL: define i32 @fn4_cpy +; CHECK-NOT: call void (...) @llvm.icall.branch.funnel +define i32 @fn4_cpy(ptr %obj) #0 { + %p = call i1 @llvm.type.test(ptr @vt1_1, metadata !"typeid1") + call void @llvm.assume(i1 %p) + %fptr = load ptr, ptr @vt1_1 + ; RETP: call i32 @__typeid_typeid1_0_branch_funnel(ptr nest @vt1_1, ptr %obj, i32 1) + %result = call i32 %fptr(ptr %obj, i32 1) + ; NORETP: call i32 % + ret i32 %result +} + +; CHECK-LABEL: define i32 @fn4_rv +; CHECK-NOT: call void (...) 
@llvm.icall.branch.funnel +define i32 @fn4_rv(ptr %obj) #0 { + %p = call i1 @llvm.type.test(ptr @vt1_1_rv, metadata !"typeid1_rv") + call void @llvm.assume(i1 %p) + %fptr = call ptr @llvm.load.relative.i32(ptr @vt1_1_rv, i32 0) + ; RETP: call i32 @__typeid_typeid1_rv_0_branch_funnel(ptr nest @vt1_1_rv, ptr %obj, i32 1) + %result = call i32 %fptr(ptr %obj, i32 1) + ; NORETP: call i32 % + ret i32 %result +} + +; CHECK-LABEL: define i32 @fn4_rv_cpy +; CHECK-NOT: call void (...) @llvm.icall.branch.funnel +define i32 @fn4_rv_cpy(ptr %obj) #0 { + %p = call i1 @llvm.type.test(ptr @vt1_1_rv, metadata !"typeid1_rv") + call void @llvm.assume(i1 %p) + %fptr = call ptr @llvm.load.relative.i32(ptr @vt1_1_rv, i32 0) + ; RETP: call i32 @__typeid_typeid1_rv_0_branch_funnel(ptr nest @vt1_1_rv, ptr %obj, i32 1) + %result = call i32 %fptr(ptr %obj, i32 1) + ; NORETP: call i32 % + ret i32 %result +} + ; CHECK-LABEL: define hidden void @__typeid_typeid1_0_branch_funnel(ptr nest %0, ...) ; CHECK-NEXT: musttail call void (...) @llvm.icall.branch.funnel(ptr %0, ptr {{(nonnull )?}}@vt1_1, ptr {{(nonnull )?}}@vf1_1, ptr {{(nonnull )?}}@vt1_2, ptr {{(nonnull )?}}@vf1_2, ...) From 2e9bcadb7c8acaa8f6ec7d807e5666246923e468 Mon Sep 17 00:00:00 2001 From: Chia-hung Duan Date: Thu, 23 Mar 2023 21:49:02 +0000 Subject: [PATCH 182/208] Revert "[scudo] Add a Timer class to assist performance measurement" This reverts commit e0361396c2281a108a36d186161ace1843925431. --- .../lib/scudo/standalone/CMakeLists.txt | 2 - .../lib/scudo/standalone/tests/CMakeLists.txt | 1 - .../scudo/standalone/tests/timing_test.cpp | 86 ------- compiler-rt/lib/scudo/standalone/timing.cpp | 29 --- compiler-rt/lib/scudo/standalone/timing.h | 215 ------------------ 5 files changed, 333 deletions(-) delete mode 100644 compiler-rt/lib/scudo/standalone/tests/timing_test.cpp delete mode 100644 compiler-rt/lib/scudo/standalone/timing.cpp delete mode 100644 compiler-rt/lib/scudo/standalone/timing.h diff --git a/compiler-rt/lib/scudo/standalone/CMakeLists.txt b/compiler-rt/lib/scudo/standalone/CMakeLists.txt index 6fcd4deddf716..eefcffd4cfc56 100644 --- a/compiler-rt/lib/scudo/standalone/CMakeLists.txt +++ b/compiler-rt/lib/scudo/standalone/CMakeLists.txt @@ -85,7 +85,6 @@ set(SCUDO_HEADERS stack_depot.h stats.h string_utils.h - timing.h tsd_exclusive.h tsd_shared.h tsd.h @@ -108,7 +107,6 @@ set(SCUDO_SOURCES report.cpp rss_limit_checker.cpp string_utils.cpp - timing.cpp ) # Enable the necessary instruction set for scudo_crc32.cpp, if available. diff --git a/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt b/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt index 335e4b7dbd899..50468d9c6ddc3 100644 --- a/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt +++ b/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt @@ -105,7 +105,6 @@ set(SCUDO_UNIT_TEST_SOURCES size_class_map_test.cpp stats_test.cpp strings_test.cpp - timing_test.cpp tsd_test.cpp vector_test.cpp scudo_unit_test_main.cpp diff --git a/compiler-rt/lib/scudo/standalone/tests/timing_test.cpp b/compiler-rt/lib/scudo/standalone/tests/timing_test.cpp deleted file mode 100644 index 09a6c31224673..0000000000000 --- a/compiler-rt/lib/scudo/standalone/tests/timing_test.cpp +++ /dev/null @@ -1,86 +0,0 @@ -//===-- timing_test.cpp -----------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "tests/scudo_unit_test.h" - -#include "timing.h" - -#include - -class ScudoTimingTest : public Test { -public: - void testFunc1() { scudo::ScopedTimer ST(Manager, __func__); } - - void testFunc2() { - scudo::ScopedTimer ST(Manager, __func__); - testFunc1(); - } - - void testChainedCalls() { - scudo::ScopedTimer ST(Manager, __func__); - testFunc2(); - } - - void testIgnoredTimer() { - scudo::ScopedTimer ST(Manager, __func__); - ST.ignore(); - } - - void printAllTimersStats() { Manager.printAll(); } - - scudo::TimingManager &getTimingManager() { return Manager; } - -private: - scudo::TimingManager Manager; -}; - -// Given that the output of statistics of timers are dumped through -// `scudo::Printf` which is platform dependent, so we don't have a reliable way -// to catch the output and verify the details. Now we only verify the number of -// invocations on linux. -TEST_F(ScudoTimingTest, SimpleTimer) { -#if SCUDO_LINUX - testing::internal::LogToStderr(); - testing::internal::CaptureStderr(); -#endif - - testIgnoredTimer(); - testChainedCalls(); - printAllTimersStats(); - -#if SCUDO_LINUX - std::string output = testing::internal::GetCapturedStderr(); - EXPECT_TRUE(output.find("testIgnoredTimer (1)") == std::string::npos); - EXPECT_TRUE(output.find("testChainedCalls (1)") != std::string::npos); - EXPECT_TRUE(output.find("testFunc2 (1)") != std::string::npos); - EXPECT_TRUE(output.find("testFunc1 (1)") != std::string::npos); -#endif -} - -TEST_F(ScudoTimingTest, NestedTimer) { -#if SCUDO_LINUX - testing::internal::LogToStderr(); - testing::internal::CaptureStderr(); -#endif - - { - scudo::ScopedTimer Outer(getTimingManager(), "Outer"); - { - scudo::ScopedTimer Inner1(getTimingManager(), Outer, "Inner1"); - { scudo::ScopedTimer Inner2(getTimingManager(), Inner1, "Inner2"); } - } - } - printAllTimersStats(); - -#if SCUDO_LINUX - std::string output = testing::internal::GetCapturedStderr(); - EXPECT_TRUE(output.find("Outer (1)") != std::string::npos); - EXPECT_TRUE(output.find("Inner1 (1)") != std::string::npos); - EXPECT_TRUE(output.find("Inner2 (1)") != std::string::npos); -#endif -} diff --git a/compiler-rt/lib/scudo/standalone/timing.cpp b/compiler-rt/lib/scudo/standalone/timing.cpp deleted file mode 100644 index 59ae21d10f0f6..0000000000000 --- a/compiler-rt/lib/scudo/standalone/timing.cpp +++ /dev/null @@ -1,29 +0,0 @@ -//===-- timing.cpp ----------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "timing.h" - -namespace scudo { - -Timer::~Timer() { - if (Manager) - Manager->report(*this); -} - -ScopedTimer::ScopedTimer(TimingManager &Manager, const char *Name) - : Timer(Manager.getOrCreateTimer(Name)) { - start(); -} - -ScopedTimer::ScopedTimer(TimingManager &Manager, const Timer &Nest, - const char *Name) - : Timer(Manager.nest(Nest, Name)) { - start(); -} - -} // namespace scudo diff --git a/compiler-rt/lib/scudo/standalone/timing.h b/compiler-rt/lib/scudo/standalone/timing.h deleted file mode 100644 index 155111f9f8e52..0000000000000 --- a/compiler-rt/lib/scudo/standalone/timing.h +++ /dev/null @@ -1,215 +0,0 @@ -//===-- timing.h ------------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "common.h" -#include "mutex.h" -#include "string_utils.h" -#include "thread_annotations.h" - -#include - -namespace scudo { - -class TimingManager; - -// A simple timer for evaluating execution time of code snippets. It can be used -// along with TimingManager or standalone. -class Timer { -public: - // The use of Timer without binding to a TimingManager is supposed to do the - // timer logging manually. Otherwise, TimingManager will do the logging stuff - // for you. - Timer() = default; - Timer(Timer &&Other) - : StartTime(0), AccTime(Other.AccTime), Manager(Other.Manager), - HandleId(Other.HandleId) { - Other.Manager = nullptr; - } - - Timer(const Timer &) = delete; - - virtual ~Timer(); - - void start() { - CHECK_EQ(StartTime, 0U); - StartTime = getMonotonicTime(); - } - void stop() { - AccTime += getMonotonicTime() - StartTime; - StartTime = 0; - } - u64 getAccumulatedTime() const { return AccTime; } - - // Unset the bound TimingManager so that we don't report the data back. This - // is useful if we only want to track subset of certain scope events. - void ignore() { - StartTime = 0; - AccTime = 0; - Manager = nullptr; - } - -protected: - friend class TimingManager; - Timer(TimingManager &Manager, u32 HandleId) - : Manager(&Manager), HandleId(HandleId) {} - - u64 StartTime = 0; - u64 AccTime = 0; - TimingManager *Manager = nullptr; - u32 HandleId; -}; - -// A RAII-style wrapper for easy scope execution measurement. Note that in order -// not to take additional space for the message like `Name`. It only works with -// TimingManager. -class ScopedTimer : public Timer { -public: - ScopedTimer(TimingManager &Manager, const char *Name); - ScopedTimer(TimingManager &Manager, const Timer &Nest, const char *Name); - ~ScopedTimer() override { stop(); } -}; - -// In Scudo, the execution time of single run of code snippets may not be -// useful, we are more interested in the average time from several runs. -// TimingManager lets the registered timer report their data and reports the -// average execution time for each timer periodically. 
-class TimingManager { -public: - TimingManager(u32 PrintingInterval = DefaultPrintingInterval) - : PrintingInterval(PrintingInterval) {} - ~TimingManager() { - if (NumAllocatedTimers != 0) - printAll(); - } - - Timer getOrCreateTimer(const char *Name) EXCLUDES(Mutex) { - ScopedLock L(Mutex); - - CHECK_LT(strlen(Name), MaxLenOfTimerName); - for (u32 I = 0; I < NumAllocatedTimers; ++I) { - if (strncmp(Name, Timers[I].Name, MaxLenOfTimerName) == 0) - return Timer(*this, I); - } - - CHECK_LT(NumAllocatedTimers, MaxNumberOfTimers); - strncpy(Timers[NumAllocatedTimers].Name, Name, MaxLenOfTimerName); - TimerRecords[NumAllocatedTimers].AccumulatedTime = 0; - TimerRecords[NumAllocatedTimers].Occurrence = 0; - return Timer(*this, NumAllocatedTimers++); - } - - // Add a sub-Timer associated with another Timer. This is used when we want to - // detail the execution time in the scope of a Timer. - // For example, - // void Foo() { - // // T1 records the time spent in both first and second tasks. - // ScopedTimer T1(getTimingManager(), "Task1"); - // { - // // T2 records the time spent in first task - // ScopedTimer T2(getTimingManager, T1, "Task2"); - // // Do first task. - // } - // // Do second task. - // } - // - // The report will show proper indents to indicate the nested relation like, - // -- Average Operation Time -- -- Name (# of Calls) -- - // 10.0(ns) Task1 (1) - // 5.0(ns) Task2 (1) - Timer nest(const Timer &T, const char *Name) EXCLUDES(Mutex) { - CHECK_EQ(T.Manager, this); - Timer Nesting = getOrCreateTimer(Name); - - ScopedLock L(Mutex); - CHECK_NE(Nesting.HandleId, T.HandleId); - Timers[Nesting.HandleId].Nesting = T.HandleId; - return Nesting; - } - - void report(const Timer &T) EXCLUDES(Mutex) { - ScopedLock L(Mutex); - - const u32 HandleId = T.HandleId; - CHECK_LT(HandleId, MaxNumberOfTimers); - TimerRecords[HandleId].AccumulatedTime += T.getAccumulatedTime(); - ++TimerRecords[HandleId].Occurrence; - ++NumEventsReported; - if (NumEventsReported % PrintingInterval == 0) - printAllImpl(); - } - - void printAll() EXCLUDES(Mutex) { - ScopedLock L(Mutex); - printAllImpl(); - } - -private: - void printAllImpl() REQUIRES(Mutex) { - static char NameHeader[] = "-- Name (# of Calls) --"; - static char AvgHeader[] = "-- Average Operation Time --"; - ScopedString Str; - Str.append("%-15s %-15s\n", AvgHeader, NameHeader); - - for (u32 I = 0; I < NumAllocatedTimers; ++I) { - if (Timers[I].Nesting != MaxNumberOfTimers) - continue; - printImpl(Str, I); - } - - Str.output(); - } - - void printImpl(ScopedString &Str, const u32 HandleId, - const u32 ExtraIndent = 0) REQUIRES(Mutex) { - const uptr AccumulatedTime = TimerRecords[HandleId].AccumulatedTime; - const uptr Occurrence = TimerRecords[HandleId].Occurrence; - const uptr Integral = Occurrence == 0 ? 0 : AccumulatedTime / Occurrence; - // Only keep single digit of fraction is enough and it enables easier layout - // maintenance. - const uptr Fraction = - Occurrence == 0 ? 0 - : ((AccumulatedTime % Occurrence) * 10) / Occurrence; - - Str.append("%14zu.%zu(ns) %-11s", Integral, Fraction, " "); - - for (u32 I = 0; I < ExtraIndent; ++I) - Str.append("%s", " "); - Str.append("%s (%zu)\n", Timers[HandleId].Name, Occurrence); - - for (u32 I = 0; I < NumAllocatedTimers; ++I) - if (Timers[I].Nesting == HandleId) - printImpl(Str, I, ExtraIndent + 1); - } - - // Instead of maintaining pages for timer registration, a static buffer is - // sufficient for most use cases in Scudo. 
- static constexpr u32 MaxNumberOfTimers = 50; - static constexpr u32 MaxLenOfTimerName = 50; - static constexpr u32 DefaultPrintingInterval = 100; - - struct Record { - uptr AccumulatedTime = 0; - uptr Occurrence = 0; - }; - - struct TimerInfo { - char Name[MaxLenOfTimerName + 1]; - u32 Nesting = MaxNumberOfTimers; - }; - - HybridMutex Mutex; - // The frequency of proactively dumping the timer statistics. For example, the - // default setting is to dump the statistics every 100 reported events. - u32 PrintingInterval GUARDED_BY(Mutex); - uptr NumEventsReported GUARDED_BY(Mutex) = 0; - u32 NumAllocatedTimers GUARDED_BY(Mutex) = 0; - TimerInfo Timers[MaxNumberOfTimers] GUARDED_BY(Mutex); - Record TimerRecords[MaxNumberOfTimers] GUARDED_BY(Mutex); -}; - -} // namespace scudo From ec2333d88538c1675227a555140a13bc27aafb69 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Thu, 23 Mar 2023 14:43:23 -0700 Subject: [PATCH 183/208] [JITLink] Add a jitlink::Section::empty operation. --- .../llvm/ExecutionEngine/JITLink/JITLink.h | 3 +++ .../ExecutionEngine/JITLink/LinkGraphTests.cpp | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h index 3bc9bebea6e0b..464b21d536300 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h @@ -727,6 +727,9 @@ class Section { /// Returns the ordinal for this section. SectionOrdinal getOrdinal() const { return SecOrdinal; } + /// Returns true if this section is empty (contains no blocks or symbols). + bool empty() const { return Blocks.empty(); } + /// Returns an iterator over the blocks defined in this section. iterator_range blocks() { return make_range(Blocks.begin(), Blocks.end()); diff --git a/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp b/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp index 0146c3b4cf6e0..a3cb1b6fd638b 100644 --- a/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp +++ b/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp @@ -92,6 +92,24 @@ TEST(LinkGraphTest, AddressAccess) { EXPECT_EQ(B1.getFixupAddress(E1), B1Addr + 8) << "Incorrect fixup address"; } +TEST(LinkGraphTest, SectionEmpty) { + // Check that Section::empty behaves as expected. + LinkGraph G("foo", Triple("x86_64-apple-darwin"), 8, support::little, + getGenericEdgeKindName); + auto &Sec1 = + G.createSection("__data.1", orc::MemProt::Read | orc::MemProt::Write); + auto &B = + G.createContentBlock(Sec1, BlockContent, orc::ExecutorAddr(0x1000), 8, 0); + G.addDefinedSymbol(B, 0, "S", 4, Linkage::Strong, Scope::Default, false, + false); + + auto &Sec2 = + G.createSection("__data.2", orc::MemProt::Read | orc::MemProt::Write); + + EXPECT_FALSE(Sec1.empty()); + EXPECT_TRUE(Sec2.empty()); +} + TEST(LinkGraphTest, BlockAndSymbolIteration) { // Check that we can iterate over blocks within Sections and across sections. LinkGraph G("foo", Triple("x86_64-apple-darwin"), 8, support::little, From 397486566e995a019c249784b1d07c53b6ac670d Mon Sep 17 00:00:00 2001 From: Cyndy Ishida Date: Thu, 23 Mar 2023 14:51:37 -0700 Subject: [PATCH 184/208] [llvm][TextAPI] Handle implicitly upgraded deployment versions Sometimes the clang driver will receive a target triple where the deployment version is too low to support the platform + arch. In those cases, the compiler upgrades the final minOS which is what gets recorded ultimately by the linker in LC_BUILD_VERSION. 
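As a rough illustration of the clamping rule (a self-contained sketch with
made-up version pairs, not the real `llvm::Triple` API):

```cpp
#include <algorithm>
#include <cstdio>
#include <utility>

// Illustrative (major, minor) version pairs.
using Version = std::pair<int, int>;

// If the requested deployment target is older than the minimum the
// platform/arch pair supports, the minimum wins.
Version mapToSupported(Version requested, Version minimumSupported) {
  return std::max(requested, minimumSupported);
}

int main() {
  // e.g. arm64-macos asked for 10.14, but arm64 macOS only exists from 11.0.
  Version v = mapToSupported({10, 14}, {11, 0});
  std::printf("%d.%d\n", v.first, v.second); // prints: 11.0
}
```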
TextAPI should also reuse this logic for capturing minOS in recorded TBDv5 files. Reviewed By: ributzka Differential Revision: https://reviews.llvm.org/D145690 --- llvm/include/llvm/TextAPI/Platform.h | 2 + llvm/include/llvm/TextAPI/Target.h | 2 +- llvm/lib/TextAPI/Platform.cpp | 7 ++++ llvm/lib/TextAPI/TextStubV5.cpp | 6 ++- llvm/unittests/TextAPI/TextStubV5Tests.cpp | 44 ++++++++++++++++++++++ 5 files changed, 58 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/TextAPI/Platform.h b/llvm/include/llvm/TextAPI/Platform.h index d4225ca533fc0..834f833306d1b 100644 --- a/llvm/include/llvm/TextAPI/Platform.h +++ b/llvm/include/llvm/TextAPI/Platform.h @@ -14,6 +14,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/BinaryFormat/MachO.h" +#include "llvm/Support/VersionTuple.h" namespace llvm { namespace MachO { @@ -27,6 +28,7 @@ StringRef getPlatformName(PlatformType Platform); PlatformType getPlatformFromName(StringRef Name); std::string getOSAndEnvironmentName(PlatformType Platform, std::string Version = ""); +VersionTuple mapToSupportedOSVersion(const Triple &Triple); } // end namespace MachO. } // end namespace llvm. diff --git a/llvm/include/llvm/TextAPI/Target.h b/llvm/include/llvm/TextAPI/Target.h index dc0e4f92ae802..0ab2783fc60c5 100644 --- a/llvm/include/llvm/TextAPI/Target.h +++ b/llvm/include/llvm/TextAPI/Target.h @@ -33,7 +33,7 @@ class Target { : Arch(Arch), Platform(Platform), MinDeployment(MinDeployment) {} explicit Target(const llvm::Triple &Triple) : Arch(mapToArchitecture(Triple)), Platform(mapToPlatformType(Triple)), - MinDeployment(Triple.getOSVersion()) {} + MinDeployment(mapToSupportedOSVersion(Triple)) {} static llvm::Expected create(StringRef Target); diff --git a/llvm/lib/TextAPI/Platform.cpp b/llvm/lib/TextAPI/Platform.cpp index 673fcb764bf86..a432462c82e33 100644 --- a/llvm/lib/TextAPI/Platform.cpp +++ b/llvm/lib/TextAPI/Platform.cpp @@ -132,5 +132,12 @@ std::string getOSAndEnvironmentName(PlatformType Platform, llvm_unreachable("Unknown llvm::MachO::PlatformType enum"); } +VersionTuple mapToSupportedOSVersion(const Triple &Triple) { + const VersionTuple MinSupportedOS = Triple.getMinimumSupportedOSVersion(); + if (MinSupportedOS > Triple.getOSVersion()) + return MinSupportedOS; + return Triple.getOSVersion(); +} + } // end namespace MachO. } // end namespace llvm. diff --git a/llvm/lib/TextAPI/TextStubV5.cpp b/llvm/lib/TextAPI/TextStubV5.cpp index a9355fabe2202..ade4c867fa49d 100644 --- a/llvm/lib/TextAPI/TextStubV5.cpp +++ b/llvm/lib/TextAPI/TextStubV5.cpp @@ -293,8 +293,10 @@ Expected getTargetsSection(const Object *Section) { if (!TargetOrErr) return make_error(getParseErrorMsg(TBDKey::Target)); TargetOrErr->MinDeployment = Version; - - IFTargets.push_back(*TargetOrErr); + // Convert to LLVM::Triple to accurately compute minOS + platform + arch + // pairing. 
+ IFTargets.push_back( + MachO::Target(Triple(getTargetTripleName(*TargetOrErr)))); } return std::move(IFTargets); } diff --git a/llvm/unittests/TextAPI/TextStubV5Tests.cpp b/llvm/unittests/TextAPI/TextStubV5Tests.cpp index 3deb38a5a0a3d..b4e8f513daee2 100644 --- a/llvm/unittests/TextAPI/TextStubV5Tests.cpp +++ b/llvm/unittests/TextAPI/TextStubV5Tests.cpp @@ -944,6 +944,50 @@ TEST(TBDv5, Target_Simulator) { EXPECT_EQ(*File, *WriteResultFile); } +TEST(TBDv5, Target_UnsupportedMinOS) { + static const char TBDv5File[] = R"({ +"tapi_tbd_version": 5, +"main_library": { + "target_info": [ + { + "target": "arm64-macos", + "min_deployment": "10.14" + }, + { + "target": "x86_64-macos", + "min_deployment": "10.14" + } + ], + "install_names":[ + { "name":"/S/L/F/Foo.framework/Foo" } + ] +}})"; + + Expected Result = + TextAPIReader::get(MemoryBufferRef(TBDv5File, "Test.tbd")); + EXPECT_TRUE(!!Result); + TBDFile File = std::move(Result.get()); + EXPECT_EQ(FileType::TBD_V5, File->getFileType()); + TargetList ExpectedTargets = { + Target(AK_x86_64, PLATFORM_MACOS, VersionTuple(10, 14)), + Target(AK_arm64, PLATFORM_MACOS, VersionTuple(11, 0)), + }; + TargetList Targets{File->targets().begin(), File->targets().end()}; + llvm::sort(Targets); + EXPECT_EQ(Targets, ExpectedTargets); + + SmallString<4096> Buffer; + raw_svector_ostream OS(Buffer); + Error WriteResult = TextAPIWriter::writeToStream(OS, *File); + EXPECT_TRUE(!WriteResult); + + Expected Output = + TextAPIReader::get(MemoryBufferRef(Buffer, "Output.tbd")); + EXPECT_TRUE(!!Output); + TBDFile WriteResultFile = std::move(Output.get()); + EXPECT_EQ(*File, *WriteResultFile); +} + TEST(TBDv5, MisspelledKey) { static const char TBDv5File[] = R"({ "tapi_tbd_version": 5, From c13ccf1fbabede34ff28461b29d2d14aceb293fd Mon Sep 17 00:00:00 2001 From: NagaChaitanya Vellanki Date: Thu, 23 Mar 2023 14:38:37 -0700 Subject: [PATCH 185/208] [clang][ExtractAPI]Fix Declaration fragments for instancetype in the type position degrade to id Fixes https://github.com/llvm/llvm-project/issues/61481 Reviewed By: dang Differential Revision: https://reviews.llvm.org/D146671 --- clang/lib/ExtractAPI/DeclarationFragments.cpp | 22 +- clang/test/ExtractAPI/objc_instancetype.m | 254 ++++++++++++++++++ 2 files changed, 267 insertions(+), 9 deletions(-) create mode 100644 clang/test/ExtractAPI/objc_instancetype.m diff --git a/clang/lib/ExtractAPI/DeclarationFragments.cpp b/clang/lib/ExtractAPI/DeclarationFragments.cpp index b8de1270b5f02..c42a1de2fd358 100644 --- a/clang/lib/ExtractAPI/DeclarationFragments.cpp +++ b/clang/lib/ExtractAPI/DeclarationFragments.cpp @@ -243,26 +243,30 @@ DeclarationFragments DeclarationFragmentsBuilder::getFragmentsForType( return Fragments.append(getFragmentsForType(ET->desugar(), Context, After)); } - // Everything we care about has been handled now, reduce to the canonical - // unqualified base type. - QualType Base = T->getCanonicalTypeUnqualified(); - - // Render Objective-C `id`/`instancetype` as keywords. - if (T->isObjCIdType()) - return Fragments.append(Base.getAsString(), - DeclarationFragments::FragmentKind::Keyword); - // If the type is a typedefed type, get the underlying TypedefNameDecl for a // direct reference to the typedef instead of the wrapped type. 
+ + // 'id' type is a typedef for an ObjCObjectPointerType + // we treat it as a typedef if (const TypedefType *TypedefTy = dyn_cast(T)) { const TypedefNameDecl *Decl = TypedefTy->getDecl(); TypedefUnderlyingTypeResolver TypedefResolver(Context); std::string USR = TypedefResolver.getUSRForType(QualType(T, 0)); + + if (T->isObjCIdType()) { + return Fragments.append(Decl->getName(), + DeclarationFragments::FragmentKind::Keyword); + } + return Fragments.append( Decl->getName(), DeclarationFragments::FragmentKind::TypeIdentifier, USR, TypedefResolver.getUnderlyingTypeDecl(QualType(T, 0))); } + // Everything we care about has been handled now, reduce to the canonical + // unqualified base type. + QualType Base = T->getCanonicalTypeUnqualified(); + // If the base type is a TagType (struct/interface/union/class/enum), let's // get the underlying Decl for better names and USRs. if (const TagType *TagTy = dyn_cast(Base)) { diff --git a/clang/test/ExtractAPI/objc_instancetype.m b/clang/test/ExtractAPI/objc_instancetype.m new file mode 100644 index 0000000000000..1680fe9336cf3 --- /dev/null +++ b/clang/test/ExtractAPI/objc_instancetype.m @@ -0,0 +1,254 @@ +// RUN: rm -rf %t +// RUN: split-file %s %t +// RUN: sed -e "s@INPUT_DIR@%{/t:regex_replacement}@g" \ + // RUN: %t/reference.output.json.in >> %t/reference.output.json +// RUN: %clang_cc1 -extract-api -triple arm64-apple-macosx -x objective-c-header %t/input.h -o %t/output.json -verify + +// Generator version is not consistent across test runs, normalize it. +// RUN: sed -e "s@\"generator\": \".*\"@\"generator\": \"?\"@g" \ + // RUN: %t/output.json >> %t/output-normalized.json +// RUN: diff %t/reference.output.json %t/output-normalized.json + + +//--- input.h +@interface Foo +- (instancetype) init; +- (id) reset; +@end +// expected-no-diagnostics + + +//--- reference.output.json.in +{ + "metadata": { + "formatVersion": { + "major": 0, + "minor": 5, + "patch": 3 + }, + "generator": "?" 
+ }, + "module": { + "name": "", + "platform": { + "architecture": "arm64", + "operatingSystem": { + "minimumVersion": { + "major": 11, + "minor": 0, + "patch": 0 + }, + "name": "macosx" + }, + "vendor": "apple" + } + }, + "relationships": [ + { + "kind": "memberOf", + "source": "c:objc(cs)Foo(im)init", + "target": "c:objc(cs)Foo", + "targetFallback": "Foo" + }, + { + "kind": "memberOf", + "source": "c:objc(cs)Foo(im)reset", + "target": "c:objc(cs)Foo", + "targetFallback": "Foo" + } + ], + "symbols": [ + { + "accessLevel": "public", + "declarationFragments": [ + { + "kind": "keyword", + "spelling": "@interface" + }, + { + "kind": "text", + "spelling": " " + }, + { + "kind": "identifier", + "spelling": "Foo" + } + ], + "identifier": { + "interfaceLanguage": "objective-c", + "precise": "c:objc(cs)Foo" + }, + "kind": { + "displayName": "Class", + "identifier": "objective-c.class" + }, + "location": { + "position": { + "character": 12, + "line": 1 + }, + "uri": "file://INPUT_DIR/input.h" + }, + "names": { + "navigator": [ + { + "kind": "identifier", + "spelling": "Foo" + } + ], + "subHeading": [ + { + "kind": "identifier", + "spelling": "Foo" + } + ], + "title": "Foo" + }, + "pathComponents": [ + "Foo" + ] + }, + { + "accessLevel": "public", + "declarationFragments": [ + { + "kind": "text", + "spelling": "- (" + }, + { + "kind": "keyword", + "spelling": "instancetype" + }, + { + "kind": "text", + "spelling": ") " + }, + { + "kind": "identifier", + "spelling": "init" + }, + { + "kind": "text", + "spelling": ";" + } + ], + "functionSignature": { + "returns": [ + { + "kind": "keyword", + "spelling": "instancetype" + } + ] + }, + "identifier": { + "interfaceLanguage": "objective-c", + "precise": "c:objc(cs)Foo(im)init" + }, + "kind": { + "displayName": "Instance Method", + "identifier": "objective-c.method" + }, + "location": { + "position": { + "character": 1, + "line": 2 + }, + "uri": "file://INPUT_DIR/input.h" + }, + "names": { + "navigator": [ + { + "kind": "identifier", + "spelling": "init" + } + ], + "subHeading": [ + { + "kind": "text", + "spelling": "- " + }, + { + "kind": "identifier", + "spelling": "init" + } + ], + "title": "init" + }, + "pathComponents": [ + "Foo", + "init" + ] + }, + { + "accessLevel": "public", + "declarationFragments": [ + { + "kind": "text", + "spelling": "- (" + }, + { + "kind": "keyword", + "spelling": "id" + }, + { + "kind": "text", + "spelling": ") " + }, + { + "kind": "identifier", + "spelling": "reset" + }, + { + "kind": "text", + "spelling": ";" + } + ], + "functionSignature": { + "returns": [ + { + "kind": "keyword", + "spelling": "id" + } + ] + }, + "identifier": { + "interfaceLanguage": "objective-c", + "precise": "c:objc(cs)Foo(im)reset" + }, + "kind": { + "displayName": "Instance Method", + "identifier": "objective-c.method" + }, + "location": { + "position": { + "character": 1, + "line": 3 + }, + "uri": "file://INPUT_DIR/input.h" + }, + "names": { + "navigator": [ + { + "kind": "identifier", + "spelling": "reset" + } + ], + "subHeading": [ + { + "kind": "text", + "spelling": "- " + }, + { + "kind": "identifier", + "spelling": "reset" + } + ], + "title": "reset" + }, + "pathComponents": [ + "Foo", + "reset" + ] + } + ] +} From 999643f1513e86d7d438ec953a3d73c4bc21eb25 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Wed, 22 Mar 2023 19:14:00 -0700 Subject: [PATCH 186/208] [WebAssembly] Tidy up DebugValueManager (NFC) Misc. cleanups for `WebAssemblyDebugValueManager`. 
- Use `Register` for registers - Simpler for loop iteration - Rename a variable - Reorder methods - Reduce `SmallVector` size for `DBG_VALUE`s to 1; one def usually have a single `DBG_VALUE` attached to it in most cases - Add a few more lines of comments Reviewed By: dschuff Differential Revision: https://reviews.llvm.org/D146743 --- .../WebAssemblyDebugValueManager.cpp | 35 +++++++++---------- .../WebAssemblyDebugValueManager.h | 17 ++++++--- 2 files changed, 28 insertions(+), 24 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyDebugValueManager.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyDebugValueManager.cpp index 55be64ad7da01..45502a577e4e2 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyDebugValueManager.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyDebugValueManager.cpp @@ -18,21 +18,18 @@ using namespace llvm; -WebAssemblyDebugValueManager::WebAssemblyDebugValueManager( - MachineInstr *Instr) { +WebAssemblyDebugValueManager::WebAssemblyDebugValueManager(MachineInstr *Def) { // This code differs from MachineInstr::collectDebugValues in that it scans // the whole BB, not just contiguous DBG_VALUEs. - if (!Instr->getOperand(0).isReg()) + if (!Def->getOperand(0).isReg()) return; - CurrentReg = Instr->getOperand(0).getReg(); + CurrentReg = Def->getOperand(0).getReg(); - MachineBasicBlock::iterator DI = *Instr; - ++DI; - for (MachineBasicBlock::iterator DE = Instr->getParent()->end(); DI != DE; - ++DI) { - if (DI->isDebugValue() && - DI->hasDebugOperandForReg(Instr->getOperand(0).getReg())) - DbgValues.push_back(&*DI); + for (MachineBasicBlock::iterator MI = std::next(Def->getIterator()), + ME = Def->getParent()->end(); + MI != ME; ++MI) { + if (MI->isDebugValue() && MI->hasDebugOperandForReg(CurrentReg)) + DbgValues.push_back(&*MI); } } @@ -42,15 +39,8 @@ void WebAssemblyDebugValueManager::move(MachineInstr *Insert) { MBB->splice(Insert, DBI->getParent(), DBI); } -void WebAssemblyDebugValueManager::updateReg(unsigned Reg) { - for (auto *DBI : DbgValues) - for (auto &MO : DBI->getDebugOperandsForReg(CurrentReg)) - MO.setReg(Reg); - CurrentReg = Reg; -} - void WebAssemblyDebugValueManager::clone(MachineInstr *Insert, - unsigned NewReg) { + Register NewReg) { MachineBasicBlock *MBB = Insert->getParent(); MachineFunction *MF = MBB->getParent(); for (MachineInstr *DBI : reverse(DbgValues)) { @@ -61,6 +51,13 @@ void WebAssemblyDebugValueManager::clone(MachineInstr *Insert, } } +void WebAssemblyDebugValueManager::updateReg(Register Reg) { + for (auto *DBI : DbgValues) + for (auto &MO : DBI->getDebugOperandsForReg(CurrentReg)) + MO.setReg(Reg); + CurrentReg = Reg; +} + void WebAssemblyDebugValueManager::replaceWithLocal(unsigned LocalId) { for (auto *DBI : DbgValues) { auto IndexType = DBI->isIndirectDebugValue() diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyDebugValueManager.h b/llvm/lib/Target/WebAssembly/WebAssemblyDebugValueManager.h index c2dd569093044..4c63af21406e1 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyDebugValueManager.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyDebugValueManager.h @@ -9,6 +9,9 @@ /// \file /// This file contains the declaration of the WebAssembly-specific /// manager for DebugValues associated with the specific MachineInstr. +/// This pass currently does not handle DBG_VALUE_LISTs; they are assumed to +/// have been set to undef in NullifyDebugValueLists pass. 
+/// TODO Handle DBG_VALUE_LIST /// //===----------------------------------------------------------------------===// @@ -16,21 +19,25 @@ #define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYDEBUGVALUEMANAGER_H #include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/Register.h" namespace llvm { class MachineInstr; class WebAssemblyDebugValueManager { - SmallVector DbgValues; - unsigned CurrentReg; + SmallVector DbgValues; + Register CurrentReg; public: - WebAssemblyDebugValueManager(MachineInstr *Instr); + WebAssemblyDebugValueManager(MachineInstr *Def); void move(MachineInstr *Insert); - void updateReg(unsigned Reg); - void clone(MachineInstr *Insert, unsigned NewReg); + void clone(MachineInstr *Insert, Register NewReg); + // Update the register for Def and DBG_VALUEs. + void updateReg(Register Reg); + // Replace the current register in DBG_VALUEs with the given LocalId target + // index. void replaceWithLocal(unsigned LocalId); }; From bb0ecb7bf0d0025e61086ae449dae099a8a8bf14 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 23 Mar 2023 15:49:38 -0700 Subject: [PATCH 187/208] [Driver][test] Remove remnant mips*-linux-android tests after 805f51f9fedf90d2aa0ad46c61cb4c9c0c5bcfe9 --- clang/test/Driver/clang-translation.c | 7 ------- clang/test/Driver/constructors.c | 6 ------ clang/test/Driver/linux-ld.c | 5 ----- 3 files changed, 18 deletions(-) diff --git a/clang/test/Driver/clang-translation.c b/clang/test/Driver/clang-translation.c index 058ac32bbdb50..d950d9a4de9be 100644 --- a/clang/test/Driver/clang-translation.c +++ b/clang/test/Driver/clang-translation.c @@ -483,10 +483,3 @@ // MIPSN32R6EL: "-target-cpu" "mips64r6" // MIPSN32R6EL: "-target-abi" "n32" // MIPSN32R6EL: "-mfloat-abi" "hard" - -// RUN: %clang -target mips64el-linux-android -### -S %s 2>&1 | \ -// RUN: FileCheck -check-prefix=MIPS64EL-ANDROID %s -// MIPS64EL-ANDROID: clang -// MIPS64EL-ANDROID: "-cc1" -// MIPS64EL-ANDROID: "-target-cpu" "mips64r6" -// MIPS64EL-ANDROID: "-mfloat-abi" "hard" diff --git a/clang/test/Driver/constructors.c b/clang/test/Driver/constructors.c index f844e80a5450f..f210ad512f270 100644 --- a/clang/test/Driver/constructors.c +++ b/clang/test/Driver/constructors.c @@ -50,12 +50,6 @@ // RUN: | FileCheck --check-prefix=CHECK-INIT-ARRAY %s // // RUN: %clang -### %s -fsyntax-only 2>&1 \ -// RUN: --target=mipsel-unknown-linux-android \ -// RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ -// RUN: --gcc-toolchain="" \ -// RUN: | FileCheck --check-prefix=CHECK-INIT-ARRAY %s -// -// RUN: %clang -### %s -fsyntax-only 2>&1 \ // RUN: --target=i386-unknown-linux-android \ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ // RUN: --gcc-toolchain="" \ diff --git a/clang/test/Driver/linux-ld.c b/clang/test/Driver/linux-ld.c index be1230ac0ab63..27786dce67cc6 100644 --- a/clang/test/Driver/linux-ld.c +++ b/clang/test/Driver/linux-ld.c @@ -1223,11 +1223,6 @@ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ // RUN: | FileCheck --check-prefix=CHECK-ANDROID-64 %s // RUN: %clang -### %s -no-pie 2>&1 \ -// RUN: --target=mips64el-linux-android \ -// RUN: --gcc-toolchain="" \ -// RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ -// RUN: | FileCheck --check-prefix=CHECK-ANDROID-64 %s -// RUN: %clang -### %s -no-pie 2>&1 \ // RUN: --target=i686-linux-android \ // RUN: --gcc-toolchain="" \ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ From ccc2f362db352df8991f493d8a05bdf99eaeea4b Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 23 Mar 2023 15:58:42 -0700 Subject: [PATCH 188/208] 
Android.rules: remove mips* rules They have been obsoleted for a long time and D146565 recently removed Clang support. --- lldb/packages/Python/lldbsuite/test/make/Android.rules | 8 -------- 1 file changed, 8 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/make/Android.rules b/lldb/packages/Python/lldbsuite/test/make/Android.rules index 32f786aa34756..cd7d8ae74d6bf 100644 --- a/lldb/packages/Python/lldbsuite/test/make/Android.rules +++ b/lldb/packages/Python/lldbsuite/test/make/Android.rules @@ -24,14 +24,6 @@ else ifeq "$(ARCH)" "i386" SYSROOT_ARCH := x86 STL_ARCH := x86 TRIPLE := i686-none-linux-android -else ifeq "$(ARCH)" "mips64r6" - SYSROOT_ARCH := mips64 - STL_ARCH := mips64 - TRIPLE := mips64el-none-linux-android -else ifeq "$(ARCH)" "mips32" - SYSROOT_ARCH := mips - STL_ARCH := mips - TRIPLE := mipsel-none-linux-android else SYSROOT_ARCH := $(ARCH) STL_ARCH := $(ARCH) From 399f313f271342d1d838bf396af5c5d8d587915a Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 23 Mar 2023 16:00:16 -0700 Subject: [PATCH 189/208] [Driver] Remove remnant mips64el-linux-android code after D146565 --- clang/lib/Driver/ToolChains/Arch/Mips.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Arch/Mips.cpp b/clang/lib/Driver/ToolChains/Arch/Mips.cpp index 7da00a8854006..f9f14c01b2b9f 100644 --- a/clang/lib/Driver/ToolChains/Arch/Mips.cpp +++ b/clang/lib/Driver/ToolChains/Arch/Mips.cpp @@ -39,12 +39,6 @@ void mips::getMipsCPUAndABI(const ArgList &Args, const llvm::Triple &Triple, DefMips64CPU = "mips64r6"; } - // MIPS64r6 is the default for Android MIPS64 (mips64el-linux-android). - if (Triple.isAndroid()) { - DefMips32CPU = "mips32"; - DefMips64CPU = "mips64r6"; - } - // MIPS3 is the default for mips64*-unknown-openbsd. if (Triple.isOSOpenBSD()) DefMips64CPU = "mips3"; From 07ef7b1ff21e8e3faaf8279b8ec6a7f0ac252fad Mon Sep 17 00:00:00 2001 From: Bruno Cardoso Lopes Date: Thu, 23 Mar 2023 14:34:12 -0700 Subject: [PATCH 190/208] [Builtins] Add __builtin_assume_separate_storage Plumbing from the language level to the assume intrinsics with separate_storage operand bundles. Patch by David Goldblatt (davidtgoldblatt) Differential Revision: https://reviews.llvm.org/D136515 --- clang/docs/LanguageExtensions.rst | 40 +++++++++++++++++++ clang/docs/ReleaseNotes.rst | 2 + clang/include/clang/Basic/Builtins.def | 1 + clang/lib/CodeGen/CGBuiltin.cpp | 12 ++++++ .../CodeGen/builtin-assume-separate-storage.c | 36 +++++++++++++++++ .../Sema/builtin-assume-separate-storage.c | 13 ++++++ 6 files changed, 104 insertions(+) create mode 100644 clang/test/CodeGen/builtin-assume-separate-storage.c create mode 100644 clang/test/Sema/builtin-assume-separate-storage.c diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index f8c83d4d6d162..a9bdc83c53e7a 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -2358,6 +2358,46 @@ evaluated, so any side effects of the expression will be discarded. Query for this feature with ``__has_builtin(__builtin_assume)``. +.. _langext-__builtin_assume_separate_storage: + +``__builtin_assume_separate_storage`` +-------------------- + +``__builtin_assume_separate_storage`` is used to provide the optimizer with the +knowledge that its two arguments point to separately allocated objects. + +**Syntax**: + +.. code-block:: c++ + + __builtin_assume_separate_storage(const volatile void *, const volatile void *) + +**Example of Use**: + +.. 
code-block:: c++ + + int foo(int *x, int *y) { + __builtin_assume_separate_storage(x, y); + *x = 0; + *y = 1; + // The optimizer may optimize this to return 0 without reloading from *x. + return *x; + } + +**Description**: + +The arguments to this function are assumed to point into separately allocated +storage (either different variable definitions or different dynamic storage +allocations). The optimizer may use this fact to aid in alias analysis. If the +arguments point into the same storage, the behavior is undefined. Note that the +definition of "storage" here refers to the outermost enclosing allocation of any +particular object (so for example, it's never correct to call this function +passing the addresses of fields in the same struct, elements of the same array, +etc.). + +Query for this feature with ``__has_builtin(__builtin_assume_separate_storage)``. + + ``__builtin_offsetof`` ---------------------- diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index faac3b17b223f..29e3f516c06e5 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -124,6 +124,8 @@ Non-comprehensive list of changes in this release - Clang now supports ``__builtin_FILE_NAME()`` which returns the same information as the ``__FILE_NAME__`` macro (the presumed file name from the invocation point, with no path components included). +- Clang now supports ``__builtin_assume_separate_storage`` that indicates that + its arguments point to objects in separate storage allocations. New Compiler Flags ------------------ diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def index 957375eccb84a..dea806099efbf 100644 --- a/clang/include/clang/Basic/Builtins.def +++ b/clang/include/clang/Basic/Builtins.def @@ -1591,6 +1591,7 @@ BUILTIN(__builtin_annotation, "v.", "tn") // Invariants BUILTIN(__builtin_assume, "vb", "nE") +BUILTIN(__builtin_assume_separate_storage, "vvCD*vCD*", "nE") // Multiprecision Arithmetic Builtins. BUILTIN(__builtin_addcb, "UcUcCUcCUcCUc*", "n") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 6381d68c161c6..b3aea13878c1c 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -2856,6 +2856,18 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, Builder.CreateCall(FnAssume, ArgValue); return RValue::get(nullptr); } + case Builtin::BI__builtin_assume_separate_storage: { + const Expr *Arg0 = E->getArg(0); + const Expr *Arg1 = E->getArg(1); + + Value *Value0 = EmitScalarExpr(Arg0); + Value *Value1 = EmitScalarExpr(Arg1); + + Value *Values[] = {Value0, Value1}; + OperandBundleDefT OBD("separate_storage", Values); + Builder.CreateAssumption(ConstantInt::getTrue(getLLVMContext()), {OBD}); + return RValue::get(nullptr); + } case Builtin::BI__arithmetic_fence: { // Create the builtin call if FastMath is selected, and the target // supports the builtin, otherwise just return the argument. 
diff --git a/clang/test/CodeGen/builtin-assume-separate-storage.c b/clang/test/CodeGen/builtin-assume-separate-storage.c new file mode 100644 index 0000000000000..ac82f27b3e720 --- /dev/null +++ b/clang/test/CodeGen/builtin-assume-separate-storage.c @@ -0,0 +1,36 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s | FileCheck %s +void *nonconst(void); + +// CHECK-LABEL: @test1( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[A:%.*]], ptr [[A_ADDR]], align 8 +// CHECK-NEXT: store ptr [[B:%.*]], ptr [[B_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.assume(i1 true) [ "separate_storage"(ptr [[TMP0]], ptr [[TMP1]]) ] +// CHECK-NEXT: ret void +// +void test1(int *a, int *b) { + + __builtin_assume_separate_storage(a, b); +} + +// Separate storage assumptions evaluate their arguments unconditionally, like +// assume_aligned but *unlike* assume. Check that we actually do so. +// CHECK-LABEL: @test2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[A:%.*]], ptr [[A_ADDR]], align 8 +// CHECK-NEXT: store ptr [[B:%.*]], ptr [[B_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// CHECK-NEXT: [[CALL:%.*]] = call ptr @nonconst() +// CHECK-NEXT: call void @llvm.assume(i1 true) [ "separate_storage"(ptr [[TMP0]], ptr [[CALL]]) ] +// CHECK-NEXT: ret void +// +void test2(int *a, int *b) { + __builtin_assume_separate_storage(a, nonconst()); +} diff --git a/clang/test/Sema/builtin-assume-separate-storage.c b/clang/test/Sema/builtin-assume-separate-storage.c new file mode 100644 index 0000000000000..f27d0b821d5ab --- /dev/null +++ b/clang/test/Sema/builtin-assume-separate-storage.c @@ -0,0 +1,13 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify %s + +void *nonconst(void); + +void test1(int *a, int *b) { + __builtin_assume_separate_storage(a, b); + // Separate storage assumptions evaluate their arguments unconditionally, like + // assume_aligned but *unlike* assume. Check that we don't warn on it. + __builtin_assume_separate_storage(a, nonconst()); + __builtin_assume_separate_storage(nonconst(), a); + __builtin_assume_separate_storage(a, 3); // expected-error {{incompatible integer to pointer conversion}} + __builtin_assume_separate_storage(3, a); // expected-error {{incompatible integer to pointer conversion}} +} From c82803097f6a89edc49577e5bb4f7309e053efcc Mon Sep 17 00:00:00 2001 From: Quinn Dawkins Date: Thu, 23 Feb 2023 11:30:20 -0500 Subject: [PATCH 191/208] [mlir][linalg] Refactor convolution to img2col conversion to use gather semantics Following up on the comments in https://reviews.llvm.org/D144108 this patch refactors the im2col conversion patterns for `linalg.conv_2d_nhwc_hwcf` and `linalg.conv_2d_nchw_fchw` convolutions to use gather semantics for the im2col packing `linalg.generic`. Follow up work can include a similar pattern for depthwise convolutions and a generalization of the patterns here to work with any `LinalgOp` as well. 
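For reference, the index arithmetic performed by the gather-based packing (the
`unrollIndex` / `getConvolvedIndex` helpers added below) can be modeled in plain
scalar C++. This is only an illustrative sketch of the math, not the MLIR
builder code itself; the concrete factor values (14x14 output, 3x3x4 filter)
are taken from the test cases in this patch.

#include <cassert>
#include <cstdint>
#include <vector>

// Unroll a flattened index into sub-indices for iteration ranges `factors`
// (ordered slowest to fastest varying), as the im2col generic does for the
// collapsed M (= oh*ow) and K (= fh*fw*ic) dimensions.
std::vector<int64_t> unrollIndex(int64_t index,
                                 const std::vector<int64_t> &factors) {
  std::vector<int64_t> indices(factors.size());
  int64_t runningProd = 1;
  for (int64_t i = (int64_t)factors.size() - 1; i >= 0; --i) {
    int64_t unrolled = index;
    if (i > 0)
      unrolled %= runningProd * factors[i]; // keep only the lower factors
    if (runningProd > 1)
      unrolled /= runningProd;              // strip the faster-varying factors
    runningProd *= factors[i];
    indices[i] = unrolled;
  }
  return indices;
}

// Convolved input index from an output index and a filter index.
int64_t convolvedIndex(int64_t o, int64_t f, int64_t stride) {
  return o * stride + f;
}

int main() {
  // m in [0, 14*14) unrolls into (oh, ow); k in [0, 3*3*4) into (fh, fw, ic).
  std::vector<int64_t> m = unrollIndex(31, {14, 14});
  assert(m[0] == 2 && m[1] == 3);          // 31 = 2*14 + 3
  assert(convolvedIndex(m[0], 1, 1) == 3); // h = oh*sh + fh
  return 0;
}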
Differential Revision: https://reviews.llvm.org/D144678 --- .../Transforms/ConvertConv2DToImg2Col.cpp | 372 +++++++++--------- .../Linalg/convert-conv2d-to-img2col.mlir | 150 ++++--- 2 files changed, 297 insertions(+), 225 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/ConvertConv2DToImg2Col.cpp b/mlir/lib/Dialect/Linalg/Transforms/ConvertConv2DToImg2Col.cpp index 14bff411ef8c1..58a23e2be54d1 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/ConvertConv2DToImg2Col.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/ConvertConv2DToImg2Col.cpp @@ -41,6 +41,49 @@ static Value createMul(Location loc, Value x, Value y, OpBuilder &builder) { return builder.create(loc, x, y); } +// Unrolls the given composite `index` into a set of subindices with maximum +// iteration ranges specified by `factors` according to the following +// assumptions: +// 1. The iteration range for `index` is [0, f1 * f2 * ... * fn] i.e. the +// product of the given list of factors +// 2. The iterators corresponding to the entries in `factors` are ordered from +// slowest to fastest varying +// Each subindex is then computed as: +// subindex[i] = floor( (index % (fi * ... * fn)) / (fi-1 * ... * fn) ) +static SmallVector unrollIndex(OpBuilder &b, Location loc, + Value index, + ArrayRef factors) { + assert(factors.size() >= 1 && "empty factor list"); + SmallVector indices(factors.size()); + int64_t runningProd = 1; + for (int i = factors.size() - 1, end = 0; i >= end; i--) { + Value unrolledIndex = index; + if (i > 0) { + Value modBase = b.create( + loc, b.getIndexAttr(runningProd * factors[i])); + unrolledIndex = b.create(loc, unrolledIndex, modBase); + } + if (runningProd > 1) { + Value divDenom = + b.create(loc, b.getIndexAttr(runningProd)); + unrolledIndex = b.create(loc, unrolledIndex, divDenom); + } + runningProd *= factors[i]; + indices[i] = unrolledIndex; + } + return indices; +} + +// Given indices corresponding to iterators in the output (oIndex) and filter +// (fIndex) for a convolution, compute the convolved index for the +// input as `oIndex * stride + fIndex`. +static Value getConvolvedIndex(OpBuilder &b, Location loc, Value oIndex, + Value fIndex, int64_t stride) { + Value strideVal = b.create(loc, b.getIndexAttr(stride)); + Value convIndex = b.create(loc, oIndex, strideVal); + return b.create(loc, convIndex, fIndex); +} + FailureOr> rewriteInIm2Col(RewriterBase &rewriter, linalg::Conv2DNhwcHwcfOp convOp) { auto inputType = convOp.getInputs()[0].getType().cast(); @@ -68,32 +111,34 @@ rewriteInIm2Col(RewriterBase &rewriter, linalg::Conv2DNhwcHwcfOp convOp) { ArrayRef filterShape = filterType.getShape(); ArrayRef outputShape = outputType.getShape(); - int n = outputShape[0]; - int oh = outputShape[1]; - int ow = outputShape[2]; - int oc = outputShape[3]; - int fh = filterShape[0]; - int fw = filterShape[1]; - int ic = filterShape[2]; + int64_t n = outputShape[0]; + int64_t oh = outputShape[1]; + int64_t ow = outputShape[2]; + int64_t oc = outputShape[3]; + int64_t fh = filterShape[0]; + int64_t fw = filterShape[1]; + int64_t ic = filterShape[2]; Location loc = convOp.getLoc(); - SmallVector colTensorShape = {n, oh, ow, fh, fw, ic}; + // Reshape output and filter to the LHS and result of a (B)MNK matmul. 
+ SmallVector filterReassocIndices = {{0, 1, 2}, {3}}; + auto reshapedFilterType = + RankedTensorType::get({fh * fw * ic, oc}, inputType.getElementType()); + Value reshapedFilter = rewriter.create( + loc, reshapedFilterType, filter, filterReassocIndices); + + SmallVector outputReassocIndices = {{0}, {1, 2}, {3}}; + RankedTensorType reshapedOutputType = + RankedTensorType::get({n, oh * ow, oc}, outputType.getElementType()); + Value reshapedOutput = rewriter.create( + loc, reshapedOutputType, output, outputReassocIndices); + SmallVector colTensorShape = {n, oh * ow, fh * fw * ic}; Value colTensor = rewriter.create( loc, colTensorShape, inputType.getElementType()); - AffineExpr nDim, ohDim, owDim, khDim, kwDim, icDim; - bindDims(context, nDim, ohDim, owDim, khDim, kwDim, icDim); - - AffineExpr shSym = rewriter.getAffineConstantExpr( - convOp.getStrides().getValues()[0]); - AffineExpr swSym = rewriter.getAffineConstantExpr( - convOp.getStrides().getValues()[1]); - - SmallVector inputExprs = {nDim, ohDim * shSym + khDim, - owDim * swSym + kwDim, icDim}; - + // Convert the input to a (BMK) column tensor. auto nloops = colTensorShape.size(); auto parallel = utils::IteratorType::parallel; @@ -101,85 +146,68 @@ rewriteInIm2Col(RewriterBase &rewriter, linalg::Conv2DNhwcHwcfOp convOp) { SmallVector img2colIterators(nloops, parallel); SmallVector img2colIndexingMaps = { - AffineMap::get(nloops, 0, inputExprs, context), AffineMap::getMultiDimIdentityMap(nloops, context)}; auto img2ColTensor = rewriter.create( loc, colTensor.getType(), - /*inputs=*/input, /*outputs=*/colTensor, img2colIndexingMaps, + /*inputs=*/ValueRange{}, /*outputs=*/colTensor, img2colIndexingMaps, img2colIterators, [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange args) { - nestedBuilder.create(nestedLoc, args[0]); + // Get the iterators named based on the matmul (batch, m, k). + Value bIndex = nestedBuilder.create(loc, 0); + Value mIndex = nestedBuilder.create(loc, 1); + Value kIndex = nestedBuilder.create(loc, 2); + + // Recover the original iteration indices from the problem/input sizes. + SmallVector mIndices = unrollIndex( + nestedBuilder, nestedLoc, mIndex, ArrayRef{oh, ow}); + auto ohIndex = mIndices[0]; + auto owIndex = mIndices[1]; + + SmallVector kIndices = unrollIndex( + nestedBuilder, nestedLoc, kIndex, ArrayRef{fh, fw, ic}); + auto fhIndex = kIndices[0]; + auto fwIndex = kIndices[1]; + auto icIndex = kIndices[2]; + + // Extract the input element corresponding to the expanded indices. 
+ Value hIndex = + getConvolvedIndex(nestedBuilder, nestedLoc, ohIndex, fhIndex, + convOp.getStrides().getValues()[0]); + Value wIndex = + getConvolvedIndex(nestedBuilder, nestedLoc, owIndex, fwIndex, + convOp.getStrides().getValues()[1]); + + // im2col[n, oh*ow, fh*fw*ic] = input[n, sh*oh + fh, sw*ow + fw, ic] + SmallVector extractionIndices{bIndex, hIndex, wIndex, icIndex}; + Value inputVal = nestedBuilder.create( + loc, input, extractionIndices); + nestedBuilder.create(nestedLoc, inputVal); }); - SmallVector img2ColTensorReassocIndices; - SmallVector outputReassocIndices; - RankedTensorType reshapedImg2ColTensorType, reshapedOutputType; - if (n == 1) { - img2ColTensorReassocIndices = {{0, 1, 2}, {3, 4, 5}}; - outputReassocIndices = {{0, 1, 2}, {3}}; - - reshapedImg2ColTensorType = RankedTensorType::get( - {oh * ow, fh * fw * ic}, inputType.getElementType()); - reshapedOutputType = - RankedTensorType::get({oh * ow, oc}, outputType.getElementType()); - } else { - img2ColTensorReassocIndices = {{0}, {1, 2}, {3, 4, 5}}; - outputReassocIndices = {{0}, {1, 2}, {3}}; - - reshapedImg2ColTensorType = RankedTensorType::get( - {n, oh * ow, fh * fw * ic}, inputType.getElementType()); - reshapedOutputType = - RankedTensorType::get({n, oh * ow, oc}, outputType.getElementType()); - } - - SmallVector filterReassocIndices = {{0, 1, 2}, {3}}; - auto reshapedFilterType = - RankedTensorType::get({fh * fw * ic, oc}, inputType.getElementType()); - - Value reshapedImg2ColTensor = rewriter.create( - loc, reshapedImg2ColTensorType, img2ColTensor.getResult(0), - img2ColTensorReassocIndices); - - Value reshapedFilter = rewriter.create( - loc, reshapedFilterType, filter, filterReassocIndices); - - Value reshapedOutput = rewriter.create( - loc, reshapedOutputType, output, outputReassocIndices); - - Value result; - if (n == 1) { - auto matmulOp = rewriter.create( - loc, reshapedOutputType, - ArrayRef{reshapedImg2ColTensor, reshapedFilter}, - ArrayRef{reshapedOutput}); - result = matmulOp.getResults().front(); - } else { - // For cases where batch is not 1, we need to keep the batch dimension - // separate. Because the filter does not share the same batch dimension, - // the batch dimension is only used in indexing the input and output. Thus - // we cannot use existing linalg named ops like linalg.batch_matmul. - // i.e. (B x) M x K * K x N = (B x) M x N - AffineExpr bDim, mDim, nDim, kDim; - bindDims(context, bDim, mDim, nDim, kDim); - auto lhsMap = AffineMap::get(4, 0, {bDim, mDim, kDim}, context); - auto rhsMap = AffineMap::get(4, 0, {kDim, nDim}, context); - auto resultMap = AffineMap::get(4, 0, {bDim, mDim, nDim}, context); - SmallVector genericIterators = {parallel, parallel, - parallel, reduction}; - - auto genericOp = rewriter.create( - loc, reshapedOutputType, - /*inputs=*/ValueRange{reshapedImg2ColTensor, reshapedFilter}, - /*outputs=*/ValueRange{reshapedOutput}, - ArrayRef{lhsMap, rhsMap, resultMap}, genericIterators, - [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange args) { - Value mul = createMul(loc, args[0], args[1], nestedBuilder); - Value add = createAdd(loc, mul, args[2], nestedBuilder); - nestedBuilder.create(nestedLoc, add); - }); - result = genericOp.getResults().front(); - } + // Because the filter does not share the same batch dimension, + // the batch dimension is only used in indexing the input and output. Thus + // we cannot use existing linalg named ops like linalg.batch_matmul. + // i.e. 
(B x) M x K * K x N = (B x) M x N + AffineExpr bDim, mDim, nDim, kDim; + bindDims(context, bDim, mDim, nDim, kDim); + auto lhsMap = AffineMap::get(4, 0, {bDim, mDim, kDim}, context); + auto rhsMap = AffineMap::get(4, 0, {kDim, nDim}, context); + auto resultMap = AffineMap::get(4, 0, {bDim, mDim, nDim}, context); + SmallVector genericIterators = {parallel, parallel, + parallel, reduction}; + + auto genericOp = rewriter.create( + loc, reshapedOutputType, + /*inputs=*/ValueRange{img2ColTensor.getResult(0), reshapedFilter}, + /*outputs=*/ValueRange{reshapedOutput}, + ArrayRef{lhsMap, rhsMap, resultMap}, genericIterators, + [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange args) { + Value mul = createMul(loc, args[0], args[1], nestedBuilder); + Value add = createAdd(loc, mul, args[2], nestedBuilder); + nestedBuilder.create(nestedLoc, add); + }); + Value result = genericOp.getResults().front(); auto reshapedResult = rewriter.create( loc, outputType, result, outputReassocIndices); @@ -367,33 +395,33 @@ rewriteInIm2Col(RewriterBase &rewriter, linalg::Conv2DNchwFchwOp convOp) { auto filterShape = filterType.getShape(); auto outputShape = outputType.getShape(); - int n = outputShape[0]; - int oc = outputShape[1]; - int oh = outputShape[2]; - int ow = outputShape[3]; - int ic = filterShape[1]; - int fh = filterShape[2]; - int fw = filterShape[3]; + int64_t n = outputShape[0]; + int64_t oc = outputShape[1]; + int64_t oh = outputShape[2]; + int64_t ow = outputShape[3]; + int64_t ic = filterShape[1]; + int64_t fh = filterShape[2]; + int64_t fw = filterShape[3]; auto loc = convOp.getLoc(); - - SmallVector colTensorShape = {n, ic, fh, fw, oh, ow}; - - Value colTensor = rewriter.create( - loc, colTensorShape, inputType.getElementType()); - MLIRContext *context = rewriter.getContext(); - AffineExpr nDim, icDim, khDim, kwDim, ohDim, owDim; - bindDims(context, nDim, icDim, khDim, kwDim, ohDim, owDim); + SmallVector filterReassocIndices = {{0}, {1, 2, 3}}; + auto reshapedFilterType = + RankedTensorType::get({oc, ic * fh * fw}, inputType.getElementType()); + Value reshapedFilter = rewriter.create( + loc, reshapedFilterType, filter, filterReassocIndices); - auto shSym = rewriter.getAffineConstantExpr( - convOp.getStrides().getValues()[0]); - auto swSym = rewriter.getAffineConstantExpr( - convOp.getStrides().getValues()[1]); + SmallVector outputReassocIndices = {{0}, {1}, {2, 3}}; + auto reshapedOutputType = + RankedTensorType::get({n, oc, oh * ow}, outputType.getElementType()); + Value reshapedOutput = rewriter.create( + loc, reshapedOutputType, output, outputReassocIndices); - SmallVector inputExprs = {nDim, icDim, ohDim * shSym + khDim, - owDim * swSym + kwDim}; + // Convert the input to a (BKN) tensor. 
+ SmallVector colTensorShape = {n, ic * fh * fw, oh * ow}; + Value colTensor = rewriter.create( + loc, colTensorShape, inputType.getElementType()); auto nloops = colTensorShape.size(); @@ -402,83 +430,67 @@ rewriteInIm2Col(RewriterBase &rewriter, linalg::Conv2DNchwFchwOp convOp) { SmallVector img2colIterators(nloops, parallel); SmallVector img2colIndexingMaps = { - AffineMap::get(nloops, 0, inputExprs, context), AffineMap::getMultiDimIdentityMap(nloops, context)}; auto img2ColTensor = rewriter.create( loc, colTensor.getType(), - /*inputs=*/input, /*outputs=*/colTensor, img2colIndexingMaps, + /*inputs=*/ValueRange{}, /*outputs=*/colTensor, img2colIndexingMaps, img2colIterators, [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange args) { - nestedBuilder.create(nestedLoc, args[0]); + // Get the iterators named based on the matmul (batch, m, k). + Value bIndex = nestedBuilder.create(loc, 0); + Value kIndex = nestedBuilder.create(loc, 1); + Value nIndex = nestedBuilder.create(loc, 2); + + // Recover the original iteration indices from the problem/input sizes. + SmallVector kIndices = unrollIndex( + nestedBuilder, nestedLoc, kIndex, ArrayRef{ic, fh, fw}); + auto icIndex = kIndices[0]; + auto fhIndex = kIndices[1]; + auto fwIndex = kIndices[2]; + + SmallVector nIndices = unrollIndex( + nestedBuilder, nestedLoc, nIndex, ArrayRef{oh, ow}); + auto ohIndex = nIndices[0]; + auto owIndex = nIndices[1]; + + // Extract the input element corresponding to the expanded indices. + Value hIndex = + getConvolvedIndex(nestedBuilder, nestedLoc, ohIndex, fhIndex, + convOp.getStrides().getValues()[0]); + Value wIndex = + getConvolvedIndex(nestedBuilder, nestedLoc, owIndex, fwIndex, + convOp.getStrides().getValues()[1]); + + // im2col[n, ic*fh*fw, oh*ow] = input[n, ic, sh*oh + fh, sw*ow + fw] + SmallVector extractionIndices{bIndex, icIndex, hIndex, wIndex}; + Value inputVal = nestedBuilder.create( + loc, input, extractionIndices); + nestedBuilder.create(nestedLoc, inputVal); }); - SmallVector filterReassocIndices = {{0}, {1, 2, 3}}; - auto reshapedFilterType = - RankedTensorType::get({oc, fh * fw * ic}, inputType.getElementType()); - Value reshapedFilter = rewriter.create( - loc, reshapedFilterType, filter, filterReassocIndices); - - SmallVector img2ColTensorReassocIndices; - SmallVector outputReassocIndices; - RankedTensorType reshapedImg2ColTensorType, reshapedOutputType; - if (n == 1) { - img2ColTensorReassocIndices = {{0, 1, 2, 3}, {4, 5}}; - outputReassocIndices = {{0, 1}, {2, 3}}; - - reshapedImg2ColTensorType = RankedTensorType::get( - {fh * fw * ic, oh * ow}, inputType.getElementType()); - reshapedOutputType = - RankedTensorType::get({oc, oh * ow}, outputType.getElementType()); - } else { - img2ColTensorReassocIndices = {{0}, {1, 2, 3}, {4, 5}}; - outputReassocIndices = {{0}, {1}, {2, 3}}; - - reshapedImg2ColTensorType = RankedTensorType::get( - {n, fh * fw * ic, oh * ow}, inputType.getElementType()); - reshapedOutputType = - RankedTensorType::get({n, oc, oh * ow}, outputType.getElementType()); - } - - Value reshapedImg2ColTensor = rewriter.create( - loc, reshapedImg2ColTensorType, img2ColTensor.getResult(0), - img2ColTensorReassocIndices); - - Value reshapedOutput = rewriter.create( - loc, reshapedOutputType, output, outputReassocIndices); - - Value result; - if (n == 1) { - auto matmulOp = rewriter.create( - loc, reshapedOutputType, - ArrayRef{reshapedFilter, reshapedImg2ColTensor}, - ArrayRef{reshapedOutput}); - result = matmulOp.getResults().front(); - } else { - // For cases where batch is not 
1, we need to keep the batch dimension - // separate. Because the filter does not share the same batch dimension, - // the batch dimension is only used in indexing the input and output. Thus - // we cannot use existing linalg named ops like linalg.batch_matmul. - // i.e. M x K * (B x) K x N = (B x) M x N - AffineExpr bDim, mDim, nDim, kDim; - bindDims(context, bDim, mDim, nDim, kDim); - auto lhsMap = AffineMap::get(4, 0, {mDim, kDim}, context); - auto rhsMap = AffineMap::get(4, 0, {bDim, kDim, nDim}, context); - auto resultMap = AffineMap::get(4, 0, {bDim, mDim, nDim}, context); - SmallVector genericIterators = {parallel, parallel, - parallel, reduction}; - auto genericOp = rewriter.create( - loc, reshapedOutputType, - /*inputs=*/ValueRange{reshapedFilter, reshapedImg2ColTensor}, - /*outputs=*/ValueRange{reshapedOutput}, - ArrayRef{lhsMap, rhsMap, resultMap}, genericIterators, - [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange args) { - Value mul = createMul(loc, args[0], args[1], nestedBuilder); - Value add = createAdd(loc, mul, args[2], nestedBuilder); - nestedBuilder.create(nestedLoc, add); - }); - result = genericOp.getResults().front(); - } + // Because the filter does not share the same batch dimension, + // the batch dimension is only used in indexing the input and output. Thus + // we cannot use existing linalg named ops like linalg.batch_matmul. + // i.e. M x K * (B x) K x N = (B x) M x N + AffineExpr bDim, mDim, nDim, kDim; + bindDims(context, bDim, mDim, nDim, kDim); + auto lhsMap = AffineMap::get(4, 0, {mDim, kDim}, context); + auto rhsMap = AffineMap::get(4, 0, {bDim, kDim, nDim}, context); + auto resultMap = AffineMap::get(4, 0, {bDim, mDim, nDim}, context); + SmallVector genericIterators = {parallel, parallel, + parallel, reduction}; + auto genericOp = rewriter.create( + loc, reshapedOutputType, + /*inputs=*/ValueRange{reshapedFilter, img2ColTensor.getResult(0)}, + /*outputs=*/ValueRange{reshapedOutput}, + ArrayRef{lhsMap, rhsMap, resultMap}, genericIterators, + [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange args) { + Value mul = createMul(loc, args[0], args[1], nestedBuilder); + Value add = createAdd(loc, mul, args[2], nestedBuilder); + nestedBuilder.create(nestedLoc, add); + }); + Value result = genericOp.getResults().front(); auto reshapedResult = rewriter.create( loc, outputType, result, outputReassocIndices); diff --git a/mlir/test/Dialect/Linalg/convert-conv2d-to-img2col.mlir b/mlir/test/Dialect/Linalg/convert-conv2d-to-img2col.mlir index e33e51ddababb..ffcba1086f3f6 100644 --- a/mlir/test/Dialect/Linalg/convert-conv2d-to-img2col.mlir +++ b/mlir/test/Dialect/Linalg/convert-conv2d-to-img2col.mlir @@ -29,36 +29,71 @@ transform.sequence failures(propagate) { // CHECK: IR printer: tensor_producer // CHECK-NEXT: %[[COL_TENSOR:.+]] = linalg.generic -// CHECK-SAME: affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1 + d3, d2 + d4, d5)>, -// CHECK-SAME: affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>] -// CHECK: ^bb0(%[[IN_DATA:.+]]: f32, %[[OUT_DATA:.+]]: f32) -// CHECK: linalg.yield %[[IN_DATA]] : f32 +// CHECK-SAME: affine_map<(d0, d1, d2) -> (d0, d1, d2)>] +// CHECK: ^bb0(%[[OUT_DATA:.+]]: f32) + +// Collapsed indices. +// CHECK: %[[BINDEX:.+]] = linalg.index 0 : index +// CHECK: %[[MINDEX:.+]] = linalg.index 1 : index +// CHECK: %[[KINDEX:.+]] = linalg.index 2 : index + +// Unrolled output shape indices. 
+// CHECK: %[[C14:.+]] = arith.constant 14 : index +// CHECK: %[[OWINDEX:.+]] = arith.remui %[[MINDEX]], %[[C14]] : index +// CHECK: %[[C14_1:.+]] = arith.constant 14 : index +// CHECK: %[[OHINDEX:.+]] = arith.divui %[[MINDEX]], %[[C14_1]] : index + +// Unrolled filter shape indices. +// CHECK: %[[C4:.+]] = arith.constant 4 : index +// CHECK: %[[ICINDEX:.+]] = arith.remui %[[KINDEX]], %[[C4]] : index +// CHECK: %[[C12:.+]] = arith.constant 12 : index +// CHECK: %[[FWREM:.+]] = arith.remui %[[KINDEX]], %[[C12]] : index +// CHECK: %[[C4_2:.+]] = arith.constant 4 : index +// CHECK: %[[FWINDEX:.+]] = arith.divui %[[FWREM]], %[[C4_2]] : index +// CHECK: %[[C12_3:.+]] = arith.constant 12 : index +// CHECK: %[[FHINDEX:.+]] = arith.divui %[[KINDEX]], %[[C12_3]] : index + +// Compute input indices. +// CHECK: %[[SH:.+]] = arith.constant 1 : index +// CHECK: %[[STRIDEDOH:.+]] = arith.muli %[[OHINDEX]], %[[SH]] : index +// CHECK: %[[CONVH:.+]] = arith.addi %[[STRIDEDOH]], %[[FHINDEX]] : index +// CHECK: %[[SW:.+]] = arith.constant 1 : index +// CHECK: %[[STRIDEDOW:.+]] = arith.muli %[[OWINDEX]], %[[SW]] : index +// CHECK: %[[CONVW:.+]] = arith.addi %[[STRIDEDOW]], %[[FWINDEX]] : index +// CHECK: %[[EXTRACTED_INPUT:.+]] = tensor.extract +// CHECK-SAME: %{{.+}}{{\[}}%[[BINDEX]], %[[CONVH]], %[[CONVW]], %[[ICINDEX]]] : tensor<1x16x16x4xf32> +// CHECK: linalg.yield %[[EXTRACTED_INPUT]] : f32 // CHECK: IR printer: transformed -// CHECK: tensor.expand_shape %{{[^ ]*}} {{\[}}[0, 1, 2], [3]] : tensor<196x16xf32> into tensor<1x14x14x16xf32> +// CHECK: tensor.expand_shape %{{[^ ]*}} {{\[}}[0], [1, 2], [3]] : tensor<1x196x16xf32> into tensor<1x14x14x16xf32> -// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1 + d3, d2 + d4, d5)> -// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)> +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> +// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d3, d2)> +// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> // CHECK: @conv_16433136 -// CHECK: %[[INPUT:.+]]: tensor<1x16x16x4xf32> -// CHECK: %[[FILTER:.+]]: tensor<3x3x4x16xf32> -// CHECK: %[[OUTPUT:.+]]: tensor<1x14x14x16xf32> -// CHECK: %[[INIT_COL_TENSOR:.+]] = tensor.empty() : tensor<1x14x14x3x3x4xf32> +// CHECK-SAME: %[[INPUT:.+]]: tensor<1x16x16x4xf32> +// CHECK-SAME: %[[FILTER:.+]]: tensor<3x3x4x16xf32> +// CHECK-SAME: %[[OUTPUT:.+]]: tensor<1x14x14x16xf32> +// CHECK-DAG: %[[COLLAPSED_FILTER:.+]] = tensor.collapse_shape %[[FILTER]] {{\[}}[0, 1, 2], [3]] : tensor<3x3x4x16xf32> into tensor<36x16xf32> +// CHECK-DAG: %[[COLLAPSED_OUT:.+]] = tensor.collapse_shape %[[OUTPUT]] {{\[}}[0], [1, 2], [3]] : tensor<1x14x14x16xf32> into tensor<1x196x16xf32> +// CHECK: %[[INIT_COL_TENSOR:.+]] = tensor.empty() : tensor<1x196x36xf32> // CHECK: %[[COL_TENSOR:.+]] = linalg.generic // CHECK-SAME: #[[MAP0]] +// CHECK: ^bb0(%[[OUT_DATA:.+]]: f32) +// CHECK: linalg.yield %{{.+}} : f32 +// CHECK: %[[MATMUL_RESULT:.+]] = linalg.generic // CHECK-SAME: #[[MAP1]] -// CHECK: ^bb0(%[[IN_DATA:.+]]: f32, %[[OUT_DATA:.+]]: f32) -// CHECK: linalg.yield %[[IN_DATA]] : f32 -// CHECK-DAG: %[[RESHAPED_INIT_COL_TENSOR:.+]] = tensor.collapse_shape %[[COL_TENSOR]] -// CHECK-SAME: [0, 1, 2], [3, 4, 5] -// CHECK-SAME: tensor<1x14x14x3x3x4xf32> into tensor<196x36xf32> -// CHECK-DAG: %[[RESHAPED_FILTER:.+]] = tensor.collapse_shape %[[FILTER]] -// CHECK-SAME: [0, 1, 2], [3] -// 
CHECK-SAME: tensor<3x3x4x16xf32> into tensor<36x16xf32> -// CHECK-DAG: %[[RESHAPED_OUTPUT:.+]] = tensor.collapse_shape %[[OUTPUT]] -// CHECK-SAME: [0, 1, 2], [3] -// CHECK: %[[MATMUL_RESULT:.+]] = linalg.matmul ins(%[[RESHAPED_INIT_COL_TENSOR]], %[[RESHAPED_FILTER]] : tensor<196x36xf32>, tensor<36x16xf32>) outs(%[[RESHAPED_OUTPUT]] : tensor<196x16xf32>) -// CHECK: %[[RESULT:.+]] = tensor.expand_shape %[[MATMUL_RESULT]] {{\[}}[0, 1, 2], [3]] : tensor<196x16xf32> into tensor<1x14x14x16xf32> +// CHECK-SAME: #[[MAP2]] +// CHECK-SAME: #[[MAP3]] +// CHECK-SAME: ins(%[[COL_TENSOR]], %[[COLLAPSED_FILTER]] : tensor<1x196x36xf32>, tensor<36x16xf32>) +// CHECK-SAME: outs(%[[COLLAPSED_OUT]] : tensor<1x196x16xf32>) +// CHECK: ^bb0(%[[ARG0:.+]]: f32, %[[ARG1:.+]]: f32, %[[ARG2:.+]]: f32) +// CHECK: %[[MUL:.+]] = arith.mulf %[[ARG0]], %[[ARG1]] : f32 +// CHECK: %[[ADD:.+]] = arith.addf %[[MUL]], %[[ARG2]] : f32 +// CHECK: linalg.yield %[[ADD]] : f32 +// CHECK: } -> tensor<1x196x16xf32> +// CHECK: %[[RESULT:.+]] = tensor.expand_shape %[[MATMUL_RESULT]] {{\[}}[0], [1, 2], [3]] : tensor<1x196x16xf32> into tensor<1x14x14x16xf32> // CHECK: return %[[RESULT]] func.func @conv_16433136(%arg0: tensor<1x16x16x4xf32>, %arg1: tensor<3x3x4x16xf32>, %arg2: tensor<1x14x14x16xf32>) -> tensor<1x14x14x16xf32> { @@ -156,27 +191,24 @@ transform.sequence failures(propagate) { // ----- -// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1 + d3, d2 + d4, d5)> -// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)> +// CHECK-DAG: #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)> // CHECK-DAG: #[[LHSMAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> // CHECK-DAG: #[[RHSMAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d3, d2)> // CHECK-DAG: #[[RESMAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> // CHECK: func.func @batch_nhwc_conv // CHECK-SAME: (%[[INPUT:.+]]: tensor<8x16x16x4xf32>, %[[FILTER:.+]]: tensor<3x3x4x16xf32>, %[[INIT:.+]]: tensor<8x14x14x16xf32>) -// CHECK: %[[IT:.+]] = tensor.empty() : tensor<8x14x14x3x3x4xf32> +// CHECK-DAG: %[[CS_FILTER:.+]] = tensor.collapse_shape %[[FILTER]] {{\[}}[0, 1, 2], [3]] : tensor<3x3x4x16xf32> into tensor<36x16xf32> +// CHECK-DAG: %[[CS_RESULT:.+]] = tensor.collapse_shape %[[INIT]] {{\[}}[0], [1, 2], [3]] : tensor<8x14x14x16xf32> into tensor<8x196x16xf32> +// CHECK: %[[IT:.+]] = tensor.empty() : tensor<8x196x36xf32> // CHECK: %[[IMG2COL:.+]] = linalg.generic -// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]]] -// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"] -// CHECK-SAME: ins(%[[INPUT]] : tensor<8x16x16x4xf32>) -// CHECK-SAME: outs(%[[IT]] : tensor<8x14x14x3x3x4xf32>) -// CHECK: %[[CS_INPUT:.+]] = tensor.collapse_shape %[[IMG2COL]] {{\[}}[0], [1, 2], [3, 4, 5]] : tensor<8x14x14x3x3x4xf32> into tensor<8x196x36xf32> -// CHECK: %[[CS_FILTER:.+]] = tensor.collapse_shape %[[FILTER]] {{\[}}[0, 1, 2], [3]] : tensor<3x3x4x16xf32> into tensor<36x16xf32> -// CHECK: %[[CS_RESULT:.+]] = tensor.collapse_shape %[[INIT]] {{\[}}[0], [1, 2], [3]] : tensor<8x14x14x16xf32> into tensor<8x196x16xf32> +// CHECK-SAME: indexing_maps = [#[[MAP]]] +// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel"] +// CHECK-SAME: outs(%[[IT]] : tensor<8x196x36xf32>) // CHECK: %[[MATMUL:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[LHSMAP]], #[[RHSMAP]], #[[RESMAP]]], // CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"] -// CHECK-SAME: 
ins(%[[CS_INPUT]], %[[CS_FILTER]] : tensor<8x196x36xf32>, tensor<36x16xf32>) +// CHECK-SAME: ins(%[[IMG2COL]], %[[CS_FILTER]] : tensor<8x196x36xf32>, tensor<36x16xf32>) // CHECK-SAME: outs(%[[CS_RESULT]] : tensor<8x196x16xf32>) // CHECK: ^bb0(%[[ARG0:.+]]: f32, %[[ARG1:.+]]: f32, %[[ARG2:.+]]: f32): // CHECK: %[[MUL:.+]] = arith.mulf %[[ARG0]], %[[ARG1]] : f32 @@ -201,27 +233,55 @@ transform.sequence failures(propagate) { // ----- -// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d4 + d2, d5 + d3)> -// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)> +// CHECK-DAG: #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)> // CHECK-DAG: #[[LHSMAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d1, d3)> // CHECK-DAG: #[[RHSMAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> // CHECK-DAG: #[[RESMAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> // CHECK: func.func @batch_nchw_conv // CHECK-SAME: (%[[INPUT:.+]]: tensor<8x4x16x16xf32>, %[[FILTER:.+]]: tensor<16x4x3x3xf32>, %[[INIT:.+]]: tensor<8x16x14x14xf32>) -// CHECK: %[[IT:.+]] = tensor.empty() : tensor<8x4x3x3x14x14xf32> +// CHECK-DAG: %[[CS_FILTER:.+]] = tensor.collapse_shape %[[FILTER]] {{\[}}[0], [1, 2, 3]] : tensor<16x4x3x3xf32> into tensor<16x36xf32> +// CHECK-DAG: %[[CS_RESULT:.+]] = tensor.collapse_shape %[[INIT]] {{\[}}[0], [1], [2, 3]] : tensor<8x16x14x14xf32> into tensor<8x16x196xf32> +// CHECK: %[[IT:.+]] = tensor.empty() : tensor<8x36x196xf32> // CHECK: %[[IMG2COL:.+]] = linalg.generic -// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]]] -// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"] -// CHECK-SAME: ins(%[[INPUT]] : tensor<8x4x16x16xf32>) -// CHECK-SAME: outs(%[[IT]] : tensor<8x4x3x3x14x14xf32>) -// CHECK: %[[CS_FILTER:.+]] = tensor.collapse_shape %[[FILTER]] {{\[}}[0], [1, 2, 3]] : tensor<16x4x3x3xf32> into tensor<16x36xf32> -// CHECK: %[[CS_INPUT:.+]] = tensor.collapse_shape %[[IMG2COL]] {{\[}}[0], [1, 2, 3], [4, 5]] : tensor<8x4x3x3x14x14xf32> into tensor<8x36x196xf32> -// CHECK: %[[CS_RESULT:.+]] = tensor.collapse_shape %[[INIT]] {{\[}}[0], [1], [2, 3]] : tensor<8x16x14x14xf32> into tensor<8x16x196xf32> +// CHECK-SAME: indexing_maps = [#[[MAP]]] +// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel"] +// CHECK-SAME: outs(%[[IT]] : tensor<8x36x196xf32>) +// Collapsed indices. +// CHECK: %[[BINDEX:.+]] = linalg.index 0 : index +// CHECK: %[[KINDEX:.+]] = linalg.index 1 : index +// CHECK: %[[NINDEX:.+]] = linalg.index 2 : index + +// Unrolled filter shape indices. +// CHECK: %[[C3:.+]] = arith.constant 3 : index +// CHECK: %[[FWINDEX:.+]] = arith.remui %[[KINDEX]], %[[C3]] : index +// CHECK: %[[C9:.+]] = arith.constant 9 : index +// CHECK: %[[FHREM:.+]] = arith.remui %[[KINDEX]], %[[C9]] : index +// CHECK: %[[C3_1:.+]] = arith.constant 3 : index +// CHECK: %[[FHINDEX:.+]] = arith.divui %[[FHREM]], %[[C3_1]] : index +// CHECK: %[[C9_2:.+]] = arith.constant 9 : index +// CHECK: %[[ICINDEX:.+]] = arith.divui %[[KINDEX]], %[[C9_2]] : index + +// Unrolled output shape indices. +// CHECK: %[[C14:.+]] = arith.constant 14 : index +// CHECK: %[[OWINDEX:.+]] = arith.remui %[[NINDEX]], %[[C14]] : index +// CHECK: %[[C14_3:.+]] = arith.constant 14 : index +// CHECK: %[[OHINDEX:.+]] = arith.divui %[[NINDEX]], %[[C14_3]] : index + +// Compute input indices. 
+// CHECK: %[[SH:.+]] = arith.constant 1 : index +// CHECK: %[[STRIDEDOH:.+]] = arith.muli %[[OHINDEX]], %[[SH]] : index +// CHECK: %[[CONVH:.+]] = arith.addi %[[STRIDEDOH]], %[[FHINDEX]] : index +// CHECK: %[[SW:.+]] = arith.constant 1 : index +// CHECK: %[[STRIDEDOW:.+]] = arith.muli %[[OWINDEX]], %[[SW]] : index +// CHECK: %[[CONVW:.+]] = arith.addi %[[STRIDEDOW]], %[[FWINDEX]] : index +// CHECK: %[[EXTRACTED_INPUT:.+]] = tensor.extract +// CHECK-SAME: %[[INPUT]]{{\[}}%[[BINDEX]], %[[ICINDEX]], %[[CONVH]], %[[CONVW]]] : tensor<8x4x16x16xf32> +// CHECK: linalg.yield %[[EXTRACTED_INPUT]] : f32 // CHECK: %[[MATMUL:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[LHSMAP]], #[[RHSMAP]], #[[RESMAP]]], // CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"] -// CHECK-SAME: ins(%[[CS_FILTER]], %[[CS_INPUT]] : tensor<16x36xf32>, tensor<8x36x196xf32>) +// CHECK-SAME: ins(%[[CS_FILTER]], %[[IMG2COL]] : tensor<16x36xf32>, tensor<8x36x196xf32>) // CHECK-SAME: outs(%[[CS_RESULT]] : tensor<8x16x196xf32>) // CHECK: ^bb0(%[[ARG0:.+]]: f32, %[[ARG1:.+]]: f32, %[[ARG2:.+]]: f32): // CHECK: %[[MUL:.+]] = arith.mulf %[[ARG0]], %[[ARG1]] : f32 From 24847a90aaf5842041a2cfa977a4167997307b50 Mon Sep 17 00:00:00 2001 From: LiaoChunyu Date: Fri, 24 Mar 2023 09:04:59 +0800 Subject: [PATCH 192/208] [LegalizeTypes][RISCV] Add a special case for (add X, -1) to ExpandIntRes_ADDSUB On targets without ADDCARRY or ADDE, we need to emit a separate SETCC to determine carry from the low half to the high half. The high half is calculated by a series of ADDs. When RHSLo and RHSHi are -1, without this patch, we get: Hi = (add (add LHSHi,(setult Lo, LHSLo), -1) Where as with the patch we get: Hi = (sub LHSHi, (seteq LHSLo, 0)) Only RHSLo is -1 we can instead do (setne Lo, 0). 
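As an illustrative sketch (plain C++ operating on 32-bit halves, not the
legalizer code itself), the expanded arithmetic described above amounts to:

#include <cassert>
#include <cstdint>

// Split add of -1: the low half borrows into the high half exactly when the
// original low half is zero, so Hi = LHSHi - (LHSLo == 0), Lo = LHSLo - 1.
uint64_t addAllOnes(uint32_t lo, uint32_t hi) {
  uint32_t newLo = lo - 1u;
  uint32_t newHi = hi - (lo == 0 ? 1u : 0u);
  return ((uint64_t)newHi << 32) | newLo;
}

// When only the low half of the constant is -1 (e.g. adding 0xFFFFFFFF), the
// carry into the high half is (LHSLo != 0) instead.
uint64_t addLoAllOnes(uint32_t lo, uint32_t hi, uint32_t rhsHi) {
  uint32_t newLo = lo - 1u; // lo + 0xFFFFFFFF truncated to 32 bits
  uint32_t newHi = hi + rhsHi + (lo != 0 ? 1u : 0u);
  return ((uint64_t)newHi << 32) | newLo;
}

int main() {
  for (uint64_t x : {0ull, 1ull, 0xFFFFFFFFull, 0x100000000ull, ~0ull}) {
    uint32_t lo = (uint32_t)x, hi = (uint32_t)(x >> 32);
    assert(addAllOnes(lo, hi) == x - 1);
    assert(addLoAllOnes(lo, hi, 0) == x + 0xFFFFFFFFull);
  }
  return 0;
}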
Similar to gcc: https://godbolt.org/z/M83f6rz39 Reviewed By: craig.topper Differential Revision: https://reviews.llvm.org/D146635 --- .../SelectionDAG/LegalizeIntegerTypes.cpp | 14 +++- llvm/test/CodeGen/RISCV/alu64.ll | 73 +++++++++++++++++++ .../CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll | 10 +-- .../test/CodeGen/RISCV/overflow-intrinsics.ll | 39 +++++----- llvm/test/CodeGen/RISCV/sext-zext-trunc.ll | 14 ++-- 5 files changed, 113 insertions(+), 37 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index f741ee4849dfc..c4f2fbc90e3eb 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -3026,7 +3026,14 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, if (isOneConstant(LoOps[1])) Cmp = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo, DAG.getConstant(0, dl, NVT), ISD::SETEQ); - else + else if (isAllOnesConstant(LoOps[1])) { + if (isAllOnesConstant(HiOps[1])) + Cmp = DAG.getSetCC(dl, getSetCCResultType(NVT), LoOps[0], + DAG.getConstant(0, dl, NVT), ISD::SETEQ); + else + Cmp = DAG.getSetCC(dl, getSetCCResultType(NVT), LoOps[0], + DAG.getConstant(0, dl, NVT), ISD::SETNE); + } else Cmp = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo, LoOps[0], ISD::SETULT); @@ -3037,7 +3044,10 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, Carry = DAG.getSelect(dl, NVT, Cmp, DAG.getConstant(1, dl, NVT), DAG.getConstant(0, dl, NVT)); - Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, Carry); + if (isAllOnesConstant(LoOps[1]) && isAllOnesConstant(HiOps[1])) + Hi = DAG.getNode(ISD::SUB, dl, NVT, HiOps[0], Carry); + else + Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, Carry); } else { Lo = DAG.getNode(ISD::SUB, dl, NVT, LoOps); Hi = DAG.getNode(ISD::SUB, dl, NVT, ArrayRef(HiOps, 2)); diff --git a/llvm/test/CodeGen/RISCV/alu64.ll b/llvm/test/CodeGen/RISCV/alu64.ll index 5349c82ef0f0f..29eb12f7f5429 100644 --- a/llvm/test/CodeGen/RISCV/alu64.ll +++ b/llvm/test/CodeGen/RISCV/alu64.ll @@ -530,3 +530,76 @@ define signext i32 @sraw(i64 %a, i32 zeroext %b) nounwind { %2 = ashr i32 %1, %b ret i32 %2 } + +define i64 @add_hi_and_lo_negone(i64 %0) { +; RV64I-LABEL: add_hi_and_lo_negone: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, a0, -1 +; RV64I-NEXT: ret +; +; RV32I-LABEL: add_hi_and_lo_negone: +; RV32I: # %bb.0: +; RV32I-NEXT: seqz a2, a0 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: addi a0, a0, -1 +; RV32I-NEXT: ret + %2 = add nsw i64 %0, -1 + ret i64 %2 +} + +define i64 @add_hi_zero_lo_negone(i64 %0) { +; RV64I-LABEL: add_hi_zero_lo_negone: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, -1 +; RV64I-NEXT: srli a1, a1, 32 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32I-LABEL: add_hi_zero_lo_negone: +; RV32I: # %bb.0: +; RV32I-NEXT: snez a2, a0 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: addi a0, a0, -1 +; RV32I-NEXT: ret + %2 = add i64 %0, 4294967295 + ret i64 %2 +} + +define i64 @add_lo_negone(i64 %0) { +; RV64I-LABEL: add_lo_negone: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, -1 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: addi a1, a1, -1 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32I-LABEL: add_lo_negone: +; RV32I: # %bb.0: +; RV32I-NEXT: snez a2, a0 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: addi a1, a1, -2 +; RV32I-NEXT: addi a0, a0, -1 +; RV32I-NEXT: ret + %2 = add nsw i64 %0, -4294967297 + ret i64 %2 +} + +define i64 @add_hi_one_lo_negone(i64 %0) { +; RV64I-LABEL: add_hi_one_lo_negone: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, -1 +; 
RV64I-NEXT: srli a1, a1, 31 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32I-LABEL: add_hi_one_lo_negone: +; RV32I: # %bb.0: +; RV32I-NEXT: snez a2, a0 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: addi a1, a1, 1 +; RV32I-NEXT: addi a0, a0, -1 +; RV32I-NEXT: ret + %2 = add nsw i64 %0, 8589934591 + ret i64 %2 +} diff --git a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll index f1528e94c473c..dc4b50215ab0a 100644 --- a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll +++ b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll @@ -1263,10 +1263,9 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) { ; RV32I-NEXT: bnez a0, .LBB7_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB7_2 Depth=1 +; RV32I-NEXT: seqz a0, a4 +; RV32I-NEXT: sub a3, a5, a0 ; RV32I-NEXT: addi a2, a4, -1 -; RV32I-NEXT: sltu a0, a2, a4 -; RV32I-NEXT: add a0, a5, a0 -; RV32I-NEXT: addi a3, a0, -1 ; RV32I-NEXT: j .LBB7_1 ; RV32I-NEXT: .LBB7_7: # %atomicrmw.end ; RV32I-NEXT: mv a0, a4 @@ -1327,10 +1326,9 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) { ; RV32IA-NEXT: bnez a0, .LBB7_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB7_2 Depth=1 +; RV32IA-NEXT: seqz a0, a4 +; RV32IA-NEXT: sub a3, a5, a0 ; RV32IA-NEXT: addi a2, a4, -1 -; RV32IA-NEXT: sltu a0, a2, a4 -; RV32IA-NEXT: add a0, a5, a0 -; RV32IA-NEXT: addi a3, a0, -1 ; RV32IA-NEXT: j .LBB7_1 ; RV32IA-NEXT: .LBB7_7: # %atomicrmw.end ; RV32IA-NEXT: mv a0, a4 diff --git a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll index 543c17f748a45..acad770b693d2 100644 --- a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll +++ b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll @@ -666,14 +666,13 @@ define i1 @uaddo_i64_increment_alt_dom(i64 %x, ptr %p) { define i1 @uaddo_i64_decrement_alt(i64 %x, ptr %p) { ; RV32-LABEL: uaddo_i64_decrement_alt: ; RV32: # %bb.0: -; RV32-NEXT: addi a3, a0, -1 -; RV32-NEXT: sltu a4, a3, a0 -; RV32-NEXT: add a4, a1, a4 -; RV32-NEXT: addi a4, a4, -1 -; RV32-NEXT: sw a3, 0(a2) +; RV32-NEXT: seqz a3, a0 +; RV32-NEXT: sub a3, a1, a3 +; RV32-NEXT: addi a4, a0, -1 +; RV32-NEXT: sw a4, 0(a2) ; RV32-NEXT: or a0, a0, a1 ; RV32-NEXT: snez a0, a0 -; RV32-NEXT: sw a4, 4(a2) +; RV32-NEXT: sw a3, 4(a2) ; RV32-NEXT: ret ; ; RV64-LABEL: uaddo_i64_decrement_alt: @@ -695,12 +694,11 @@ define i1 @uaddo_i64_decrement_alt_dom(i64 %x, ptr %p) { ; RV32: # %bb.0: ; RV32-NEXT: or a3, a0, a1 ; RV32-NEXT: snez a3, a3 -; RV32-NEXT: addi a4, a0, -1 -; RV32-NEXT: sltu a0, a4, a0 -; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: seqz a4, a0 +; RV32-NEXT: sub a1, a1, a4 ; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: sw a4, 0(a2) -; RV32-NEXT: sw a0, 4(a2) +; RV32-NEXT: sw a0, 0(a2) +; RV32-NEXT: sw a1, 4(a2) ; RV32-NEXT: mv a0, a3 ; RV32-NEXT: ret ; @@ -1222,22 +1220,21 @@ define i64 @foo2(ptr %p) { define void @PR41129(ptr %p64) { ; RV32-LABEL: PR41129: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lw a1, 4(a0) -; RV32-NEXT: lw a2, 0(a0) -; RV32-NEXT: or a3, a2, a1 +; RV32-NEXT: lw a2, 4(a0) +; RV32-NEXT: lw a1, 0(a0) +; RV32-NEXT: or a3, a1, a2 ; RV32-NEXT: beqz a3, .LBB36_2 ; RV32-NEXT: # %bb.1: # %false -; RV32-NEXT: andi a2, a2, 7 +; RV32-NEXT: andi a1, a1, 7 ; RV32-NEXT: sw zero, 4(a0) -; RV32-NEXT: sw a2, 0(a0) +; RV32-NEXT: sw a1, 0(a0) ; RV32-NEXT: ret ; RV32-NEXT: .LBB36_2: # %true -; RV32-NEXT: addi a3, a2, -1 -; RV32-NEXT: sltu a2, a3, a2 -; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: seqz a3, a1 +; 
RV32-NEXT: sub a2, a2, a3 ; RV32-NEXT: addi a1, a1, -1 -; RV32-NEXT: sw a3, 0(a0) -; RV32-NEXT: sw a1, 4(a0) +; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: sw a2, 4(a0) ; RV32-NEXT: ret ; ; RV64-LABEL: PR41129: diff --git a/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll b/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll index 58469ca23bb54..6be6785fc1d0e 100644 --- a/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll +++ b/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll @@ -454,10 +454,9 @@ define i32 @sext_of_not_i32(i1 %x) { define i64 @sext_of_not_i64(i1 %x) { ; RV32I-LABEL: sext_of_not_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: andi a1, a0, 1 -; RV32I-NEXT: addi a0, a1, -1 -; RV32I-NEXT: sltu a1, a0, a1 -; RV32I-NEXT: addi a1, a1, -1 +; RV32I-NEXT: andi a0, a0, 1 +; RV32I-NEXT: addi a0, a0, -1 +; RV32I-NEXT: mv a1, a0 ; RV32I-NEXT: ret ; ; RV64I-LABEL: sext_of_not_i64: @@ -541,10 +540,9 @@ define i64 @dec_of_zexted_cmp_i64(i64 %x) { ; RV32I: # %bb.0: ; RV32I-NEXT: xori a0, a0, 7 ; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: seqz a1, a0 -; RV32I-NEXT: addi a0, a1, -1 -; RV32I-NEXT: sltu a1, a0, a1 -; RV32I-NEXT: addi a1, a1, -1 +; RV32I-NEXT: seqz a0, a0 +; RV32I-NEXT: addi a0, a0, -1 +; RV32I-NEXT: mv a1, a0 ; RV32I-NEXT: ret ; ; RV64I-LABEL: dec_of_zexted_cmp_i64: From ccd96b3e03e18653e909852bfef105fc10782acb Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Thu, 23 Mar 2023 19:08:53 -0700 Subject: [PATCH 193/208] [builtins][test] Fix divmodti4_test.c on Windows By making the 64 bit integer literals unsigned. Otherwise some of them are unexpectedly sign extended (and the compiler rightly diagnosed this with warnings) Initially added in D80506. Reviewed By: MaskRay Differential Revision: https://reviews.llvm.org/D146667 --- .../test/builtins/Unit/divmodti4_test.c | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/compiler-rt/test/builtins/Unit/divmodti4_test.c b/compiler-rt/test/builtins/Unit/divmodti4_test.c index 26b3c1609f18c..7b8d3faae2cdf 100644 --- a/compiler-rt/test/builtins/Unit/divmodti4_test.c +++ b/compiler-rt/test/builtins/Unit/divmodti4_test.c @@ -55,22 +55,22 @@ char assumption_1[sizeof(ti_int) == 2*sizeof(di_int)] = {0}; tu_int tests[][4] = { -{ (ti_int) 0, (ti_int) 1, (ti_int) 0, (ti_int) 0 }, -{ (ti_int) 0, (ti_int)-1, (ti_int) 0, (ti_int) 0 }, -{ (ti_int) 2, (ti_int) 1, (ti_int) 2, (ti_int) 0 }, -{ (ti_int) 2, (ti_int)-1, (ti_int)-2, (ti_int) 0 }, -{ (ti_int)-2, (ti_int) 1, (ti_int)-2, (ti_int) 0 }, -{ (ti_int)-2, (ti_int)-1, (ti_int) 2, (ti_int) 0 }, -{ (ti_int) 5, (ti_int) 3, (ti_int) 1, (ti_int) 2 }, -{ (ti_int) 5, (ti_int)-3, (ti_int)-1, (ti_int) 2 }, -{ (ti_int)-5, (ti_int) 3, (ti_int)-1, (ti_int)-2 }, -{ (ti_int)-5, (ti_int)-3, (ti_int) 1, (ti_int)-2 }, -{ (ti_int)0x8000000000000000LL << 64 | 0, (ti_int) 1, (ti_int)0x8000000000000000LL << 64 | 0, (ti_int)0x0LL }, -{ (ti_int)0x8000000000000000LL << 64 | 0, (ti_int)-1, (ti_int)0x8000000000000000LL << 64 | 0, (ti_int)0x0LL }, -{ (ti_int)0x8000000000000000LL << 64 | 0, (ti_int)-2, (ti_int)0x4000000000000000LL << 64 | 0, (ti_int)0x0LL }, -{ (ti_int)0x8000000000000000LL << 64 | 0, (ti_int) 2, (ti_int)0xC000000000000000LL << 64 | 0, (ti_int)0x0LL }, -{ (ti_int)0x8000000000000000LL << 64 | 0, (ti_int)-3, (ti_int)0x2AAAAAAAAAAAAAAALL << 64 | 0xAAAAAAAAAAAAAAAALL, (ti_int)-2 }, -{ (ti_int)0x8000000000000000LL << 64 | 0, (ti_int) 3, (ti_int)0xD555555555555555LL << 64 | 0x5555555555555556LL, (ti_int)-2 }, +{ (ti_int) 0, (ti_int) 1, (ti_int) 0, (ti_int) 0 }, +{ (ti_int) 0, (ti_int)-1, (ti_int) 0, (ti_int) 0 }, +{ (ti_int) 
2, (ti_int) 1, (ti_int) 2, (ti_int) 0 }, +{ (ti_int) 2, (ti_int)-1, (ti_int)-2, (ti_int) 0 }, +{ (ti_int)-2, (ti_int) 1, (ti_int)-2, (ti_int) 0 }, +{ (ti_int)-2, (ti_int)-1, (ti_int) 2, (ti_int) 0 }, +{ (ti_int) 5, (ti_int) 3, (ti_int) 1, (ti_int) 2 }, +{ (ti_int) 5, (ti_int)-3, (ti_int)-1, (ti_int) 2 }, +{ (ti_int)-5, (ti_int) 3, (ti_int)-1, (ti_int)-2 }, +{ (ti_int)-5, (ti_int)-3, (ti_int) 1, (ti_int)-2 }, +{ (ti_int)0x8000000000000000ULL << 64 | 0, (ti_int) 1, (ti_int)0x8000000000000000ULL << 64 | 0, (ti_int)0x0LL }, +{ (ti_int)0x8000000000000000ULL << 64 | 0, (ti_int)-1, (ti_int)0x8000000000000000ULL << 64 | 0, (ti_int)0x0LL }, +{ (ti_int)0x8000000000000000ULL << 64 | 0, (ti_int)-2, (ti_int)0x4000000000000000ULL << 64 | 0, (ti_int)0x0LL }, +{ (ti_int)0x8000000000000000ULL << 64 | 0, (ti_int) 2, (ti_int)0xC000000000000000ULL << 64 | 0, (ti_int)0x0LL }, +{ (ti_int)0x8000000000000000ULL << 64 | 0, (ti_int)-3, (ti_int)0x2AAAAAAAAAAAAAAAULL << 64 | 0xAAAAAAAAAAAAAAAAULL, (ti_int)-2 }, +{ (ti_int)0x8000000000000000ULL << 64 | 0, (ti_int) 3, (ti_int)0xD555555555555555ULL << 64 | 0x5555555555555556ULL, (ti_int)-2 }, }; #endif From 24657a95c1447cc95f2634de50ff4008d8b17d4c Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Thu, 23 Mar 2023 16:06:37 -0700 Subject: [PATCH 194/208] [NFC] Fix Windows builds that use MSVC 14.x Differential Revision: https://reviews.llvm.org/D146769 --- mlir/lib/IR/AffineMap.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/mlir/lib/IR/AffineMap.cpp b/mlir/lib/IR/AffineMap.cpp index 9ac181f46b578..6c9034d446341 100644 --- a/mlir/lib/IR/AffineMap.cpp +++ b/mlir/lib/IR/AffineMap.cpp @@ -744,13 +744,18 @@ static AffineMap projectCommonImpl(AffineMap map, replacements.reserve(numDimOrSym); auto createNewDimOrSym = (isDim) ? getAffineDimExpr : getAffineSymbolExpr; - auto replaceDims = [](AffineExpr e, ArrayRef replacements) { + + using replace_fn_ty = + std::function)>; + replace_fn_ty replaceDims = [](AffineExpr e, + ArrayRef replacements) { return e.replaceDims(replacements); }; - auto replaceSymbols = [](AffineExpr e, ArrayRef replacements) { + replace_fn_ty replaceSymbols = [](AffineExpr e, + ArrayRef replacements) { return e.replaceSymbols(replacements); }; - auto replaceNewDimOrSym = (isDim) ? replaceDims : replaceSymbols; + replace_fn_ty replaceNewDimOrSym = (isDim) ? replaceDims : replaceSymbols; MLIRContext *context = map.getContext(); int64_t newNumDimOrSym = 0; From 5f48b861f8ce2d2355347d3b3b8826f7bfd23dd6 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 23 Mar 2023 19:26:42 -0700 Subject: [PATCH 195/208] [SelectionDAG] Use isOneConstant (NFC) --- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 3511e76ac1df2..8199e5d5a9c18 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -3628,9 +3628,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { } else { // We test only the i1 bit. Skip the AND if UNDEF or another AND. 
if (Tmp2.isUndef() || - (Tmp2.getOpcode() == ISD::AND && - isa(Tmp2.getOperand(1)) && - cast(Tmp2.getOperand(1))->getZExtValue() == 1)) + (Tmp2.getOpcode() == ISD::AND && isOneConstant(Tmp2.getOperand(1)))) Tmp3 = Tmp2; else Tmp3 = DAG.getNode(ISD::AND, dl, Tmp2.getValueType(), Tmp2, From 231fa27435105e980b113754c112980ebeb8927d Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 23 Mar 2023 19:26:43 -0700 Subject: [PATCH 196/208] [InstCombine] Generate better code for std::bit_ceil Without this patch, std::bit_ceil is compiled as: %dec = add i32 %x, -1 %lz = tail call i32 @llvm.ctlz.i32(i32 %dec, i1 false) %sub = sub i32 32, %lz %res = shl i32 1, %sub %ugt = icmp ugt i32 %x, 1 %sel = select i1 %ugt, i32 %res, i32 1 With this patch, we generate: %dec = add i32 %x, -1 %ctlz = tail call i32 @llvm.ctlz.i32(i32 %dec, i1 false) %sub = sub nsw i32 0, %ctlz %and = and i32 %1, 31 %sel = shl nuw i32 1, %and ret i32 %sel https://alive2.llvm.org/ce/z/pwezvF This patch recognizes the specific pattern from std::bit_ceil in libc++ and libstdc++ and drops the conditional move. In addition to the LLVM IR generated for std::bit_ceil(X), this patch recognizes variants like: std::bit_ceil(X - 1) std::bit_ceil(X + 1) std::bit_ceil(X + 2) std::bit_ceil(-X) std::bit_ceil(~X) This patch fixes: https://github.com/llvm/llvm-project/issues/60802 Differential Revision: https://reviews.llvm.org/D145299 --- .../InstCombine/InstCombineSelect.cpp | 131 ++++++++++++++++++ llvm/test/Transforms/InstCombine/bit_ceil.ll | 70 ++++------ 2 files changed, 160 insertions(+), 41 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 1f2441bc9fcf9..3d1dbdd6270d5 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -3163,6 +3163,134 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { return nullptr; } +// Return true if we can safely remove the select instruction for std::bit_ceil +// pattern. +static bool isSafeToRemoveBitCeilSelect(ICmpInst::Predicate Pred, Value *Cond0, + const APInt *Cond1, Value *CtlzOp, + unsigned BitWidth) { + // The challenge in recognizing std::bit_ceil(X) is that the operand is used + // for the CTLZ proper and select condition, each possibly with some + // operation like add and sub. + // + // Our aim is to make sure that -ctlz & (BitWidth - 1) == 0 even when the + // select instruction would select 1, which allows us to get rid of the select + // instruction. + // + // To see if we can do so, we do some symbolic execution with ConstantRange. + // Specifically, we compute the range of values that Cond0 could take when + // Cond == false. Then we successively transform the range until we obtain + // the range of values that CtlzOp could take. + // + // Conceptually, we follow the def-use chain backward from Cond0 while + // transforming the range for Cond0 until we meet the common ancestor of Cond0 + // and CtlzOp. Then we follow the def-use chain forward until we obtain the + // range for CtlzOp. That said, we only follow at most one ancestor from + // Cond0. Likewise, we only follow at most one ancestor from CtrlOp. + + ConstantRange CR = ConstantRange::makeExactICmpRegion( + CmpInst::getInversePredicate(Pred), *Cond1); + + // Match the operation that's used to compute CtlzOp from CommonAncestor. If + // CtlzOp == CommonAncestor, return true as no operation is needed. 
If a + // match is found, execute the operation on CR, update CR, and return true. + // Otherwise, return false. + auto MatchForward = [&](Value *CommonAncestor) { + const APInt *C = nullptr; + if (CtlzOp == CommonAncestor) + return true; + if (match(CtlzOp, m_Add(m_Specific(CommonAncestor), m_APInt(C)))) { + CR = CR.add(*C); + return true; + } + if (match(CtlzOp, m_Sub(m_APInt(C), m_Specific(CommonAncestor)))) { + CR = ConstantRange(*C).sub(CR); + return true; + } + if (match(CtlzOp, m_Not(m_Specific(CommonAncestor)))) { + CR = CR.binaryNot(); + return true; + } + return false; + }; + + const APInt *C = nullptr; + Value *CommonAncestor; + if (MatchForward(Cond0)) { + // Cond0 is either CtlzOp or CtlzOp's parent. CR has been updated. + } else if (match(Cond0, m_Add(m_Value(CommonAncestor), m_APInt(C)))) { + CR = CR.sub(*C); + if (!MatchForward(CommonAncestor)) + return false; + // Cond0's parent is either CtlzOp or CtlzOp's parent. CR has been updated. + } else { + return false; + } + + // Return true if all the values in the range are either 0 or negative (if + // treated as signed). We do so by evaluating: + // + // CR - 1 u>= (1 << BitWidth) - 1. + APInt IntMax = APInt::getSignMask(BitWidth) - 1; + CR = CR.sub(APInt(BitWidth, 1)); + return CR.icmp(ICmpInst::ICMP_UGE, IntMax); +} + +// Transform the std::bit_ceil(X) pattern like: +// +// %dec = add i32 %x, -1 +// %ctlz = tail call i32 @llvm.ctlz.i32(i32 %dec, i1 false) +// %sub = sub i32 32, %ctlz +// %shl = shl i32 1, %sub +// %ugt = icmp ugt i32 %x, 1 +// %sel = select i1 %ugt, i32 %shl, i32 1 +// +// into: +// +// %dec = add i32 %x, -1 +// %ctlz = tail call i32 @llvm.ctlz.i32(i32 %dec, i1 false) +// %neg = sub i32 0, %ctlz +// %masked = and i32 %ctlz, 31 +// %shl = shl i32 1, %sub +// +// Note that the select is optimized away while the shift count is masked with +// 31. We handle some variations of the input operand like std::bit_ceil(X + +// 1). +static Instruction *foldBitCeil(SelectInst &SI, IRBuilderBase &Builder) { + Type *SelType = SI.getType(); + unsigned BitWidth = SelType->getScalarSizeInBits(); + + Value *FalseVal = SI.getFalseValue(); + Value *TrueVal = SI.getTrueValue(); + ICmpInst::Predicate Pred; + const APInt *Cond1; + Value *Cond0, *Ctlz, *CtlzOp; + if (!match(SI.getCondition(), m_ICmp(Pred, m_Value(Cond0), m_APInt(Cond1)))) + return nullptr; + + if (match(TrueVal, m_One())) { + std::swap(FalseVal, TrueVal); + Pred = CmpInst::getInversePredicate(Pred); + } + + if (!match(FalseVal, m_One()) || + !match(TrueVal, + m_OneUse(m_Shl(m_One(), m_OneUse(m_Sub(m_SpecificInt(BitWidth), + m_Value(Ctlz)))))) || + !match(Ctlz, m_Intrinsic(m_Value(CtlzOp), m_Zero())) || + !isSafeToRemoveBitCeilSelect(Pred, Cond0, Cond1, CtlzOp, BitWidth)) + return nullptr; + + // Build 1 << (-CTLZ & (BitWidth-1)). The negation likely corresponds to a + // single hardware instruction as opposed to BitWidth - CTLZ, where BitWidth + // is an integer constant. Masking with BitWidth-1 comes free on some + // hardware as part of the shift instruction. 
+ Value *Neg = Builder.CreateNeg(Ctlz); + Value *Masked = + Builder.CreateAnd(Neg, ConstantInt::get(SelType, BitWidth - 1)); + return BinaryOperator::Create(Instruction::Shl, ConstantInt::get(SelType, 1), + Masked); +} + Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { Value *CondVal = SI.getCondition(); Value *TrueVal = SI.getTrueValue(); @@ -3590,5 +3718,8 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { if (sinkNotIntoOtherHandOfLogicalOp(SI)) return &SI; + if (Instruction *I = foldBitCeil(SI, Builder)) + return I; + return nullptr; } diff --git a/llvm/test/Transforms/InstCombine/bit_ceil.ll b/llvm/test/Transforms/InstCombine/bit_ceil.ll index 98f4cdb6fb834..6f714153a598a 100644 --- a/llvm/test/Transforms/InstCombine/bit_ceil.ll +++ b/llvm/test/Transforms/InstCombine/bit_ceil.ll @@ -6,10 +6,9 @@ define i32 @bit_ceil_32(i32 %x) { ; CHECK-LABEL: @bit_ceil_32( ; CHECK-NEXT: [[DEC:%.*]] = add i32 [[X:%.*]], -1 ; CHECK-NEXT: [[CTLZ:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[DEC]], i1 false), !range [[RNG0:![0-9]+]] -; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i32 32, [[CTLZ]] -; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 1, [[SUB]] -; CHECK-NEXT: [[UGT:%.*]] = icmp ugt i32 [[X]], 1 -; CHECK-NEXT: [[SEL:%.*]] = select i1 [[UGT]], i32 [[SHL]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = sub nsw i32 0, [[CTLZ]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 31 +; CHECK-NEXT: [[SEL:%.*]] = shl nuw i32 1, [[TMP2]] ; CHECK-NEXT: ret i32 [[SEL]] ; %dec = add i32 %x, -1 @@ -26,10 +25,9 @@ define i64 @bit_ceil_64(i64 %x) { ; CHECK-LABEL: @bit_ceil_64( ; CHECK-NEXT: [[DEC:%.*]] = add i64 [[X:%.*]], -1 ; CHECK-NEXT: [[CTLZ:%.*]] = tail call i64 @llvm.ctlz.i64(i64 [[DEC]], i1 false), !range [[RNG1:![0-9]+]] -; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i64 64, [[CTLZ]] -; CHECK-NEXT: [[SHL:%.*]] = shl nuw i64 1, [[SUB]] -; CHECK-NEXT: [[UGT:%.*]] = icmp ugt i64 [[X]], 1 -; CHECK-NEXT: [[SEL:%.*]] = select i1 [[UGT]], i64 [[SHL]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = sub nsw i64 0, [[CTLZ]] +; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], 63 +; CHECK-NEXT: [[SEL:%.*]] = shl nuw i64 1, [[TMP2]] ; CHECK-NEXT: ret i64 [[SEL]] ; %dec = add i64 %x, -1 @@ -47,11 +45,9 @@ define i32 @bit_ceil_32_minus_1(i32 %x) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SUB:%.*]] = add i32 [[X:%.*]], -2 ; CHECK-NEXT: [[CTLZ:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[SUB]], i1 false), !range [[RNG0]] -; CHECK-NEXT: [[SUB2:%.*]] = sub nuw nsw i32 32, [[CTLZ]] -; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 1, [[SUB2]] -; CHECK-NEXT: [[ADD:%.*]] = add i32 [[X]], -3 -; CHECK-NEXT: [[ULT:%.*]] = icmp ult i32 [[ADD]], -2 -; CHECK-NEXT: [[SEL:%.*]] = select i1 [[ULT]], i32 [[SHL]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = sub nsw i32 0, [[CTLZ]] +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 31 +; CHECK-NEXT: [[SEL:%.*]] = shl nuw i32 1, [[TMP1]] ; CHECK-NEXT: ret i32 [[SEL]] ; entry: @@ -69,11 +65,9 @@ entry: define i32 @bit_ceil_32_plus_1(i32 %x) { ; CHECK-LABEL: @bit_ceil_32_plus_1( ; CHECK-NEXT: [[CTLZ:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[X:%.*]], i1 false), !range [[RNG0]] -; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i32 32, [[CTLZ]] -; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 1, [[SUB]] -; CHECK-NEXT: [[DEC:%.*]] = add i32 [[X]], -1 -; CHECK-NEXT: [[ULT:%.*]] = icmp ult i32 [[DEC]], -2 -; CHECK-NEXT: [[SEL:%.*]] = select i1 [[ULT]], i32 [[SHL]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = sub nsw i32 0, [[CTLZ]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 31 +; CHECK-NEXT: [[SEL:%.*]] = shl nuw i32 1, [[TMP2]] ; CHECK-NEXT: ret i32 
[[SEL]] ; %ctlz = tail call i32 @llvm.ctlz.i32(i32 %x, i1 false) @@ -91,10 +85,9 @@ define i32 @bit_ceil_plus_2(i32 %x) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SUB:%.*]] = add i32 [[X:%.*]], 1 ; CHECK-NEXT: [[CTLZ:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[SUB]], i1 false), !range [[RNG0]] -; CHECK-NEXT: [[SUB2:%.*]] = sub nuw nsw i32 32, [[CTLZ]] -; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 1, [[SUB2]] -; CHECK-NEXT: [[ULT:%.*]] = icmp ult i32 [[X]], -2 -; CHECK-NEXT: [[SEL:%.*]] = select i1 [[ULT]], i32 [[SHL]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = sub nsw i32 0, [[CTLZ]] +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 31 +; CHECK-NEXT: [[SEL:%.*]] = shl nuw i32 1, [[TMP1]] ; CHECK-NEXT: ret i32 [[SEL]] ; entry: @@ -113,11 +106,9 @@ define i32 @bit_ceil_32_neg(i32 %x) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SUB:%.*]] = xor i32 [[X:%.*]], -1 ; CHECK-NEXT: [[CTLZ:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[SUB]], i1 false), !range [[RNG0]] -; CHECK-NEXT: [[SUB2:%.*]] = sub nuw nsw i32 32, [[CTLZ]] -; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 1, [[SUB2]] -; CHECK-NEXT: [[NOTSUB:%.*]] = add i32 [[X]], -1 -; CHECK-NEXT: [[ULT:%.*]] = icmp ult i32 [[NOTSUB]], -2 -; CHECK-NEXT: [[SEL:%.*]] = select i1 [[ULT]], i32 [[SHL]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = sub nsw i32 0, [[CTLZ]] +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 31 +; CHECK-NEXT: [[SEL:%.*]] = shl nuw i32 1, [[TMP1]] ; CHECK-NEXT: ret i32 [[SEL]] ; entry: @@ -137,10 +128,9 @@ define i32 @bit_ceil_not(i32 %x) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SUB:%.*]] = sub i32 -2, [[X:%.*]] ; CHECK-NEXT: [[CTLZ:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[SUB]], i1 false), !range [[RNG0]] -; CHECK-NEXT: [[SUB2:%.*]] = sub nuw nsw i32 32, [[CTLZ]] -; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 1, [[SUB2]] -; CHECK-NEXT: [[ULT:%.*]] = icmp ult i32 [[X]], -2 -; CHECK-NEXT: [[SEL:%.*]] = select i1 [[ULT]], i32 [[SHL]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = sub nsw i32 0, [[CTLZ]] +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 31 +; CHECK-NEXT: [[SEL:%.*]] = shl nuw i32 1, [[TMP1]] ; CHECK-NEXT: ret i32 [[SEL]] ; entry: @@ -158,18 +148,17 @@ define i32 @bit_ceil_commuted_operands(i32 %x) { ; CHECK-LABEL: @bit_ceil_commuted_operands( ; CHECK-NEXT: [[DEC:%.*]] = add i32 [[X:%.*]], -1 ; CHECK-NEXT: [[CTLZ:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[DEC]], i1 false), !range [[RNG0]] -; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i32 32, [[CTLZ]] -; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 1, [[SUB]] -; CHECK-NEXT: [[UGT_INV:%.*]] = icmp ugt i32 [[X]], 1 -; CHECK-NEXT: [[SEL:%.*]] = select i1 [[UGT_INV]], i32 [[SHL]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = sub nsw i32 0, [[CTLZ]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 31 +; CHECK-NEXT: [[SEL:%.*]] = shl nuw i32 1, [[TMP2]] ; CHECK-NEXT: ret i32 [[SEL]] ; %dec = add i32 %x, -1 %ctlz = tail call i32 @llvm.ctlz.i32(i32 %dec, i1 false) %sub = sub i32 32, %ctlz %shl = shl i32 1, %sub - %ugt = icmp ule i32 %x, 1 - %sel = select i1 %ugt, i32 1, i32 %shl + %eq = icmp eq i32 %dec, 0 + %sel = select i1 %eq, i32 1, i32 %shl ret i32 %sel } @@ -282,10 +271,9 @@ define <4 x i32> @bit_ceil_v4i32(<4 x i32> %x) { ; CHECK-LABEL: @bit_ceil_v4i32( ; CHECK-NEXT: [[DEC:%.*]] = add <4 x i32> [[X:%.*]], ; CHECK-NEXT: [[CTLZ:%.*]] = tail call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[DEC]], i1 false), !range [[RNG0]] -; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw <4 x i32> , [[CTLZ]] -; CHECK-NEXT: [[SHL:%.*]] = shl nuw <4 x i32> , [[SUB]] -; CHECK-NEXT: [[UGT:%.*]] = icmp ugt <4 x i32> [[X]], -; CHECK-NEXT: [[SEL:%.*]] = select <4 x i1> [[UGT]], <4 x i32> 
[[SHL]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = sub nsw <4 x i32> zeroinitializer, [[CTLZ]] +; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i32> [[TMP1]], +; CHECK-NEXT: [[SEL:%.*]] = shl nuw <4 x i32> , [[TMP2]] ; CHECK-NEXT: ret <4 x i32> [[SEL]] ; %dec = add <4 x i32> %x, From 4950104e243a6af2d0b9da30b415a10670a9385e Mon Sep 17 00:00:00 2001 From: XinWang10 Date: Thu, 23 Mar 2023 22:32:18 -0400 Subject: [PATCH 197/208] [NFC][X86]remove trailing space in X86InstrArithmetic.td In this file, most of the line don't have trailing spaces, but some of them have. To keep consistent, remove the trailing spaces. Reviewed By: skan Differential Revision: https://reviews.llvm.org/D146697 --- llvm/lib/Target/X86/X86InstrArithmetic.td | 80 +++++++++++------------ 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td index 42cc7c8f4585d..46d1412aa984c 100644 --- a/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -125,12 +125,12 @@ class BinOpRR_Rev opcode, string mnemonic, X86TypeInfo typeinfo, let hasSideEffects = 0; } -// BinOpRR_RFF_Rev - Binary instructions with inputs "reg, reg"(reversed +// BinOpRR_RFF_Rev - Binary instructions with inputs "reg, reg"(reversed // encoding), with sched = WriteADC. class BinOpRR_RFF_Rev opcode, string mnemonic, X86TypeInfo typeinfo> : BinOpRR_Rev; -// BinOpRR_F_Rev - Binary instructions with inputs "reg, reg"(reversed +// BinOpRR_F_Rev - Binary instructions with inputs "reg, reg"(reversed // encoding), without outlist dag. class BinOpRR_F_Rev opcode, string mnemonic, X86TypeInfo typeinfo> : ITy opcode, string mnemonic, X86TypeInfo typeinfo, // has both a regclass and EFLAGS as a result, and has EFLAGS as input. class BinOpRM_RFF opcode, string mnemonic, X86TypeInfo typeinfo, SDNode opnode> - : BinOpRM_ImplicitUse; // BinOpRI - Binary instructions with inputs "reg, imm". @@ -273,21 +273,21 @@ class BinOpMR_RMW opcode, string mnemonic, X86TypeInfo typeinfo, SDNode opnode> : BinOpMR, + (implicit EFLAGS)]>, Sched<[WriteALURMW, // base, scale, index, offset, segment ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, WriteALU.ReadAfterFold]>; // reg -// BinOpMR_RMW_FF - Binary instructions with inputs "[mem], reg", where the +// BinOpMR_RMW_FF - Binary instructions with inputs "[mem], reg", where the // pattern use EFLAGS as operand and implicitly use EFLAGS. 
class BinOpMR_RMW_FF opcode, string mnemonic, X86TypeInfo typeinfo, SDNode opnode> : BinOpMR, + (implicit EFLAGS)]>, Sched<[WriteADCRMW, // base, scale, index, offset, segment ReadDefault, ReadDefault, ReadDefault, @@ -321,7 +321,7 @@ class BinOpMI_RMW opcode, string mnemonic, X86TypeInfo typeinfo, : BinOpMI, + (implicit EFLAGS)]>, Sched<[WriteALURMW]>; // BinOpMI_RMW_FF - Binary instructions with inputs "[mem], imm", where the @@ -331,7 +331,7 @@ class BinOpMI_RMW_FF opcode, string mnemonic, X86TypeInfo typeinfo, : BinOpMI, + (implicit EFLAGS)]>, Sched<[WriteADCRMW]>; // BinOpMI_F - Binary instructions with inputs "[mem], imm", where the pattern @@ -359,7 +359,7 @@ class BinOpMI8_RMW, + (implicit EFLAGS)]>, Sched<[WriteALURMW]>; // BinOpMI8_RMW_FF - Binary instructions with inputs "[mem], imm8", where the @@ -369,7 +369,7 @@ class BinOpMI8_RMW_FF, + (implicit EFLAGS)]>, Sched<[WriteADCRMW]>; // BinOpMI8_F - Binary instructions with inputs "[mem], imm8", where the pattern @@ -387,7 +387,7 @@ class BinOpAI opcode, string mnemonic, X86TypeInfo typeinfo, Register areg, string operands, X86FoldableSchedWrite sched = WriteALU> : ITy, + mnemonic, operands, []>, Sched<[sched]> { let ImmT = typeinfo.ImmEncoding; let Uses = [areg]; @@ -427,7 +427,7 @@ class UnaryOpR opcode, Format f, string mnemonic, X86TypeInfo info, class INCDECR : UnaryOpR<0xFE, f, mnemonic, info, - [(set info.RegClass:$dst, EFLAGS, + [(set info.RegClass:$dst, EFLAGS, (node info.RegClass:$src1, 1))]>; // INCDECM - Instructions like "inc [mem]". @@ -444,16 +444,16 @@ class INCDECR_ALT opcode, string mnemonic, X86TypeInfo info> } // MulOpR - Instructions like "mul reg". -class MulOpR opcode, Format f, string mnemonic, X86TypeInfo info, +class MulOpR opcode, Format f, string mnemonic, X86TypeInfo info, X86FoldableSchedWrite sched, list pattern> - : ITy, + : ITy, Sched<[sched]>; // MulOpM - Instructions like "mul [mem]". -class MulOpM opcode, Format f, string mnemonic, X86TypeInfo info, +class MulOpM opcode, Format f, string mnemonic, X86TypeInfo info, X86FoldableSchedWrite sched, list pattern> - : ITy, SchedLoadReg; // NegOpR - Instructions like "neg reg", with implicit EFLAGS. @@ -465,7 +465,7 @@ class NegOpR opcode, string mnemonic, X86TypeInfo info> // NotOpR - Instructions like "not reg". class NotOpR opcode, string mnemonic, X86TypeInfo info> : UnaryOpR; // NegOpM - Instructions like "neg [mem]", with implicit EFLAGS. @@ -496,16 +496,16 @@ class BinOpRM_C opcode, Format f, string mnemonic, X86TypeInfo info, mnemonic, "{$src2, $dst|$dst, $src2}", pattern>; // IMulOpRR - Instructions like "imul reg, reg, i8". -class IMulOpRR opcode, string mnemonic, X86TypeInfo info, +class IMulOpRR opcode, string mnemonic, X86TypeInfo info, X86FoldableSchedWrite sched> : BinOpRR_C, + (X86smul_flag info.RegClass:$src1, + info.RegClass:$src2))]>, Sched<[sched]>, TB; // IMulOpRM - Instructions like "imul reg, reg, [mem]". -class IMulOpRM opcode, string mnemonic, X86TypeInfo info, +class IMulOpRM opcode, string mnemonic, X86TypeInfo info, X86FoldableSchedWrite sched> : BinOpRM_C opcode, string mnemonic, X86TypeInfo info, Sched<[sched.Folded, sched.ReadAfterFold]>, TB; // IMulOpRRI8 - Instructions like "imul reg, reg, i8". -class IMulOpRRI8 opcode, string mnemonic, X86TypeInfo info, +class IMulOpRRI8 opcode, string mnemonic, X86TypeInfo info, X86FoldableSchedWrite sched> : ITy, + (X86smul_flag info.RegClass:$src1, + info.Imm8NoSuOperator:$src2))]>, Sched<[sched]>{ let ImmT = Imm8; } // IMulOpRRI - Instructions like "imul reg, reg, i16/i32/i64". 
-class IMulOpRRI opcode, string mnemonic, X86TypeInfo info, +class IMulOpRRI opcode, string mnemonic, X86TypeInfo info, X86FoldableSchedWrite sched> : ITy, + (X86smul_flag info.RegClass:$src1, + info.ImmNoSuOperator:$src2))]>, Sched<[sched]>{ let ImmT = info.ImmEncoding; } // IMulOpRMI8 - Instructions like "imul reg, [mem], i8". -class IMulOpRMI8 opcode, string mnemonic, X86TypeInfo info, +class IMulOpRMI8 opcode, string mnemonic, X86TypeInfo info, X86FoldableSchedWrite sched> : ITy, + info.Imm8NoSuOperator:$src2))]>, Sched<[sched.Folded]>{ let ImmT = Imm8; } // IMulOpRMI - Instructions like "imul reg, [mem], i16/i32/i64". -class IMulOpRMI opcode, string mnemonic, X86TypeInfo info, +class IMulOpRMI opcode, string mnemonic, X86TypeInfo info, X86FoldableSchedWrite sched> : ITy, + info.ImmNoSuOperator:$src2))]>, Sched<[sched.Folded]>{ let ImmT = info.ImmEncoding; } @@ -639,7 +639,7 @@ let Predicates = [UseIncDec, In64BitMode] in { // SDNode results (i8, i32). // AL,AH = AL*GR8 let Defs = [AL,EFLAGS,AX], Uses = [AL] in -def MUL8r : MulOpR<0xF6, MRM4r, "mul", Xi8, WriteIMul8, +def MUL8r : MulOpR<0xF6, MRM4r, "mul", Xi8, WriteIMul8, // FIXME: Used for 8-bit mul, ignore result upper 8 bits. // This probably ought to be moved to a def : Pat<> if the // syntax can be accepted. From 3ca6e69b6efe6ff4dc456e0ac227b292523a056f Mon Sep 17 00:00:00 2001 From: Jun Zhang Date: Fri, 24 Mar 2023 10:27:02 +0800 Subject: [PATCH 198/208] Precommit tests for #60690 Differential Revision: https://reviews.llvm.org/D146636 Signed-off-by: Jun Zhang --- llvm/test/Transforms/InstCombine/bswap.ll | 60 +++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/bswap.ll b/llvm/test/Transforms/InstCombine/bswap.ll index 8c5c761c73e29..ba68e18cf7990 100644 --- a/llvm/test/Transforms/InstCombine/bswap.ll +++ b/llvm/test/Transforms/InstCombine/bswap.ll @@ -929,3 +929,63 @@ define i32 @PR50910(i64 %t0) { %t6 = trunc i64 %t5 to i32 ret i32 %t6 } + +define i64 @PR60690_call_fshl(i64 %result) { +; CHECK-LABEL: @PR60690_call_fshl( +; CHECK-NEXT: [[AND_I:%.*]] = lshr i64 [[RESULT:%.*]], 8 +; CHECK-NEXT: [[SHR_I:%.*]] = and i64 [[AND_I]], 71777214294589695 +; CHECK-NEXT: [[AND1_I:%.*]] = shl i64 [[RESULT]], 8 +; CHECK-NEXT: [[SHL_I:%.*]] = and i64 [[AND1_I]], -71777214294589696 +; CHECK-NEXT: [[OR_I:%.*]] = or i64 [[SHR_I]], [[SHL_I]] +; CHECK-NEXT: [[AND_I7:%.*]] = shl i64 [[OR_I]], 16 +; CHECK-NEXT: [[SHL_I8:%.*]] = and i64 [[AND_I7]], -281470681808896 +; CHECK-NEXT: [[AND1_I9:%.*]] = lshr i64 [[OR_I]], 16 +; CHECK-NEXT: [[SHR_I10:%.*]] = and i64 [[AND1_I9]], 281470681808895 +; CHECK-NEXT: [[OR_I11:%.*]] = or i64 [[SHL_I8]], [[SHR_I10]] +; CHECK-NEXT: [[OR_I12:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[OR_I11]], i64 [[OR_I11]], i64 32) +; CHECK-NEXT: ret i64 [[OR_I12]] +; + %and.i = lshr i64 %result, 8 + %shr.i = and i64 %and.i, 71777214294589695 + %and1.i = shl i64 %result, 8 + %shl.i = and i64 %and1.i, -71777214294589696 + %or.i = or i64 %shr.i, %shl.i + %and.i7 = shl i64 %or.i, 16 + %shl.i8 = and i64 %and.i7, -281470681808896 + %and1.i9 = lshr i64 %or.i, 16 + %shr.i10 = and i64 %and1.i9, 281470681808895 + %or.i11 = or i64 %shl.i8, %shr.i10 + %or.i12 = tail call i64 @llvm.fshl.i64(i64 %or.i11, i64 %or.i11, i64 32) + ret i64 %or.i12 +} +declare i64 @llvm.fshl.i64(i64, i64, i64) + +define i64 @PR60690_call_fshr(i64 %result) { +; CHECK-LABEL: @PR60690_call_fshr( +; CHECK-NEXT: [[AND_I:%.*]] = lshr i64 [[RESULT:%.*]], 8 +; CHECK-NEXT: [[SHR_I:%.*]] = and i64 [[AND_I]], 71777214294589695 +; 
CHECK-NEXT: [[AND1_I:%.*]] = shl i64 [[RESULT]], 8 +; CHECK-NEXT: [[SHL_I:%.*]] = and i64 [[AND1_I]], -71777214294589696 +; CHECK-NEXT: [[OR_I:%.*]] = or i64 [[SHR_I]], [[SHL_I]] +; CHECK-NEXT: [[AND_I7:%.*]] = shl i64 [[OR_I]], 16 +; CHECK-NEXT: [[SHL_I8:%.*]] = and i64 [[AND_I7]], -281470681808896 +; CHECK-NEXT: [[AND1_I9:%.*]] = lshr i64 [[OR_I]], 16 +; CHECK-NEXT: [[SHR_I10:%.*]] = and i64 [[AND1_I9]], 281470681808895 +; CHECK-NEXT: [[OR_I11:%.*]] = or i64 [[SHL_I8]], [[SHR_I10]] +; CHECK-NEXT: [[OR_I12:%.*]] = call i64 @llvm.fshl.i64(i64 [[OR_I11]], i64 [[OR_I11]], i64 32) +; CHECK-NEXT: ret i64 [[OR_I12]] +; + %and.i = lshr i64 %result, 8 + %shr.i = and i64 %and.i, 71777214294589695 + %and1.i = shl i64 %result, 8 + %shl.i = and i64 %and1.i, -71777214294589696 + %or.i = or i64 %shr.i, %shl.i + %and.i7 = shl i64 %or.i, 16 + %shl.i8 = and i64 %and.i7, -281470681808896 + %and1.i9 = lshr i64 %or.i, 16 + %shr.i10 = and i64 %and1.i9, 281470681808895 + %or.i11 = or i64 %shl.i8, %shr.i10 + %or.i12 = tail call i64 @llvm.fshr.i64(i64 %or.i11, i64 %or.i11, i64 32) + ret i64 %or.i12 +} +declare i64 @llvm.fshr.i64(i64, i64, i64) From cea938390ea77d494b77d399ed440c0c76ef3849 Mon Sep 17 00:00:00 2001 From: Jun Zhang Date: Fri, 24 Mar 2023 10:28:02 +0800 Subject: [PATCH 199/208] [InstCombine] Try to recognize bswap pattern when calling funnel shifts Alive2: https://alive2.llvm.org/ce/z/dxxD7B Fixes: https://github.com/llvm/llvm-project/issues/60690 Signed-off-by: Jun Zhang Differential Revision: https://reviews.llvm.org/D146637 --- .../InstCombine/InstCombineCalls.cpp | 4 ++++ llvm/test/Transforms/InstCombine/bswap.ll | 24 ++----------------- llvm/test/Transforms/InstCombine/fsh.ll | 9 +++---- 3 files changed, 11 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 0fbd62e8a41c0..0708fb44b982b 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1795,6 +1795,10 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { Function *Bswap = Intrinsic::getDeclaration(Mod, Intrinsic::bswap, Ty); return CallInst::Create(Bswap, { Op0 }); } + if (Instruction *BitOp = + matchBSwapOrBitReverse(*II, /*MatchBSwaps*/ true, + /*MatchBitReversals*/ true)) + return BitOp; } // Left or right might be masked. 
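
As a sanity check on the premise of this fold (not part of the patch, just a throwaway C++ sketch with invented helper names), the shift/mask/rotate sequence exercised by the tests below really does compute a byte swap: the first step swaps bytes within each 16-bit lane, the second swaps 16-bit lanes within each 32-bit lane, and fshl(x, x, 32) rotates the halves, which together reverse all eight bytes.

  #include <cassert>
  #include <cstdint>

  // Reference byte swap for a 64-bit value.
  static uint64_t bswapRef(uint64_t X) {
    uint64_t R = 0;
    for (int I = 0; I < 8; ++I)
      R |= ((X >> (8 * I)) & 0xffULL) << (8 * (7 - I));
    return R;
  }

  // The matched pattern: byte swap within i16 lanes, i16 swap within i32
  // lanes, then a rotate by 32 (the fshl/fshr with shift amount 32).
  static uint64_t rotatePattern(uint64_t X) {
    uint64_t A = ((X >> 8) & 0x00ff00ff00ff00ffULL) |
                 ((X << 8) & 0xff00ff00ff00ff00ULL);
    uint64_t B = ((A << 16) & 0xffff0000ffff0000ULL) |
                 ((A >> 16) & 0x0000ffff0000ffffULL);
    return (B << 32) | (B >> 32);
  }

  int main() {
    for (uint64_t X : {0x0123456789abcdefULL, 0ULL, ~0ULL, 0x00ff00ff12345678ULL})
      assert(rotatePattern(X) == bswapRef(X));
    return 0;
  }

The asserts pass for the sampled values; the Alive2 link above covers the general case.
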
diff --git a/llvm/test/Transforms/InstCombine/bswap.ll b/llvm/test/Transforms/InstCombine/bswap.ll index ba68e18cf7990..bb70b4e0c1be2 100644 --- a/llvm/test/Transforms/InstCombine/bswap.ll +++ b/llvm/test/Transforms/InstCombine/bswap.ll @@ -932,17 +932,7 @@ define i32 @PR50910(i64 %t0) { define i64 @PR60690_call_fshl(i64 %result) { ; CHECK-LABEL: @PR60690_call_fshl( -; CHECK-NEXT: [[AND_I:%.*]] = lshr i64 [[RESULT:%.*]], 8 -; CHECK-NEXT: [[SHR_I:%.*]] = and i64 [[AND_I]], 71777214294589695 -; CHECK-NEXT: [[AND1_I:%.*]] = shl i64 [[RESULT]], 8 -; CHECK-NEXT: [[SHL_I:%.*]] = and i64 [[AND1_I]], -71777214294589696 -; CHECK-NEXT: [[OR_I:%.*]] = or i64 [[SHR_I]], [[SHL_I]] -; CHECK-NEXT: [[AND_I7:%.*]] = shl i64 [[OR_I]], 16 -; CHECK-NEXT: [[SHL_I8:%.*]] = and i64 [[AND_I7]], -281470681808896 -; CHECK-NEXT: [[AND1_I9:%.*]] = lshr i64 [[OR_I]], 16 -; CHECK-NEXT: [[SHR_I10:%.*]] = and i64 [[AND1_I9]], 281470681808895 -; CHECK-NEXT: [[OR_I11:%.*]] = or i64 [[SHL_I8]], [[SHR_I10]] -; CHECK-NEXT: [[OR_I12:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[OR_I11]], i64 [[OR_I11]], i64 32) +; CHECK-NEXT: [[OR_I12:%.*]] = call i64 @llvm.bswap.i64(i64 [[RESULT:%.*]]) ; CHECK-NEXT: ret i64 [[OR_I12]] ; %and.i = lshr i64 %result, 8 @@ -962,17 +952,7 @@ declare i64 @llvm.fshl.i64(i64, i64, i64) define i64 @PR60690_call_fshr(i64 %result) { ; CHECK-LABEL: @PR60690_call_fshr( -; CHECK-NEXT: [[AND_I:%.*]] = lshr i64 [[RESULT:%.*]], 8 -; CHECK-NEXT: [[SHR_I:%.*]] = and i64 [[AND_I]], 71777214294589695 -; CHECK-NEXT: [[AND1_I:%.*]] = shl i64 [[RESULT]], 8 -; CHECK-NEXT: [[SHL_I:%.*]] = and i64 [[AND1_I]], -71777214294589696 -; CHECK-NEXT: [[OR_I:%.*]] = or i64 [[SHR_I]], [[SHL_I]] -; CHECK-NEXT: [[AND_I7:%.*]] = shl i64 [[OR_I]], 16 -; CHECK-NEXT: [[SHL_I8:%.*]] = and i64 [[AND_I7]], -281470681808896 -; CHECK-NEXT: [[AND1_I9:%.*]] = lshr i64 [[OR_I]], 16 -; CHECK-NEXT: [[SHR_I10:%.*]] = and i64 [[AND1_I9]], 281470681808895 -; CHECK-NEXT: [[OR_I11:%.*]] = or i64 [[SHL_I8]], [[SHR_I10]] -; CHECK-NEXT: [[OR_I12:%.*]] = call i64 @llvm.fshl.i64(i64 [[OR_I11]], i64 [[OR_I11]], i64 32) +; CHECK-NEXT: [[OR_I12:%.*]] = call i64 @llvm.bswap.i64(i64 [[RESULT:%.*]]) ; CHECK-NEXT: ret i64 [[OR_I12]] ; %and.i = lshr i64 %result, 8 diff --git a/llvm/test/Transforms/InstCombine/fsh.ll b/llvm/test/Transforms/InstCombine/fsh.ll index 7d3f3948511d1..489f6e686680b 100644 --- a/llvm/test/Transforms/InstCombine/fsh.ll +++ b/llvm/test/Transforms/InstCombine/fsh.ll @@ -672,8 +672,9 @@ define i32 @fshl_mask_args_same1(i32 %a) { define i32 @fshl_mask_args_same2(i32 %a) { ; CHECK-LABEL: @fshl_mask_args_same2( -; CHECK-NEXT: [[T1:%.*]] = shl i32 [[A:%.*]], 8 -; CHECK-NEXT: [[T2:%.*]] = and i32 [[T1]], 65280 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[A:%.*]] to i16 +; CHECK-NEXT: [[REV:%.*]] = shl i16 [[TRUNC]], 8 +; CHECK-NEXT: [[T2:%.*]] = zext i16 [[REV]] to i32 ; CHECK-NEXT: ret i32 [[T2]] ; %t1 = and i32 %a, 255 @@ -683,8 +684,8 @@ define i32 @fshl_mask_args_same2(i32 %a) { define i32 @fshl_mask_args_same3(i32 %a) { ; CHECK-LABEL: @fshl_mask_args_same3( -; CHECK-NEXT: [[T2:%.*]] = shl i32 [[A:%.*]], 24 -; CHECK-NEXT: ret i32 [[T2]] +; CHECK-NEXT: [[REV:%.*]] = shl i32 [[A:%.*]], 24 +; CHECK-NEXT: ret i32 [[REV]] ; %t1 = and i32 %a, 255 %t2 = call i32 @llvm.fshl.i32(i32 %t1, i32 %t1, i32 24) From 11674147e40699202132440313032528dfbf624f Mon Sep 17 00:00:00 2001 From: Xiaodong Liu Date: Fri, 24 Mar 2023 11:08:21 +0800 Subject: [PATCH 200/208] [LoongArch] Enable LoopDataPrefetch pass Keep `EnableLoopDataPrefetch` option off for now because we 
need a few more TTIs and ISels. This patch is inspired by http://reviews.llvm.org/D17943. Reviewed By: SixWeining Differential Revision: https://reviews.llvm.org/D146600 --- .../LoongArch/LoongArchTargetMachine.cpp | 12 +++++++++ .../LoopDataPrefetch/LoongArch/basic.ll | 25 +++++++++++++++++++ .../LoopDataPrefetch/LoongArch/lit.local.cfg | 2 ++ 3 files changed, 39 insertions(+) create mode 100644 llvm/test/Transforms/LoopDataPrefetch/LoongArch/basic.ll create mode 100644 llvm/test/Transforms/LoopDataPrefetch/LoongArch/lit.local.cfg diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp index 933ba3b40ce40..504019c2a09e8 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp @@ -19,6 +19,7 @@ #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Transforms/Scalar.h" #include using namespace llvm; @@ -34,6 +35,11 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchTarget() { initializeLoongArchDAGToDAGISelPass(*PR); } +static cl::opt + EnableLoopDataPrefetch("loongarch-enable-loop-data-prefetch", cl::Hidden, + cl::desc("Enable the loop data prefetch pass"), + cl::init(false)); + static std::string computeDataLayout(const Triple &TT) { if (TT.isArch64Bit()) return "e-m:e-p:64:64-i64:64-i128:128-n64-S128"; @@ -126,6 +132,12 @@ LoongArchTargetMachine::createPassConfig(PassManagerBase &PM) { } void LoongArchPassConfig::addIRPasses() { + // Run LoopDataPrefetch + // + // Run this before LSR to remove the multiplies involved in computing the + // pointer values N iterations ahead. + if (TM->getOptLevel() != CodeGenOpt::None && EnableLoopDataPrefetch) + addPass(createLoopDataPrefetchPass()); addPass(createAtomicExpandPass()); TargetPassConfig::addIRPasses(); diff --git a/llvm/test/Transforms/LoopDataPrefetch/LoongArch/basic.ll b/llvm/test/Transforms/LoopDataPrefetch/LoongArch/basic.ll new file mode 100644 index 0000000000000..55a2a2970d2d7 --- /dev/null +++ b/llvm/test/Transforms/LoopDataPrefetch/LoongArch/basic.ll @@ -0,0 +1,25 @@ +;; Tag this 'XFAIL' because we need a few more TTIs and ISels. 
+; XFAIL: * +; RUN: opt --mtriple=loongarch64 --passes=loop-data-prefetch -loongarch-enable-loop-data-prefetch -S < %s | FileCheck %s + +define void @foo(ptr %a, ptr %b) { +entry: + br label %for.body + +; CHECK: for.body: +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds double, ptr %b, i64 %indvars.iv +; CHECK: call void @llvm.prefetch + %0 = load double, ptr %arrayidx, align 8 + %add = fadd double %0, 1.000000e+00 + %arrayidx2 = getelementptr inbounds double, ptr %a, i64 %indvars.iv + store double %add, ptr %arrayidx2, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1600 + br i1 %exitcond, label %for.end, label %for.body + +; CHECK: for.end: +for.end: ; preds = %for.body + ret void +} diff --git a/llvm/test/Transforms/LoopDataPrefetch/LoongArch/lit.local.cfg b/llvm/test/Transforms/LoopDataPrefetch/LoongArch/lit.local.cfg new file mode 100644 index 0000000000000..2b5a4893e686f --- /dev/null +++ b/llvm/test/Transforms/LoopDataPrefetch/LoongArch/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'LoongArch' in config.root.targets: + config.unsupported = True From d8efbcf9dcbb413fa3d3e66173f4630989e5588c Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 23 Mar 2023 20:20:20 -0700 Subject: [PATCH 201/208] [AArch64] Add tests for umax(x, 1u) This patch adds tests for umax(x, 1u). This patch fixes: https://github.com/llvm/llvm-project/issues/60233 It turns out that commit 86b4d8645fc1b86693fef564cef68f24599c930f on Feb 8, 2023 already performs the instcombine transformation proposed in the issue, so the issue requires no change on the codegen side. --- llvm/test/CodeGen/AArch64/min-max.ll | 112 +++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git a/llvm/test/CodeGen/AArch64/min-max.ll b/llvm/test/CodeGen/AArch64/min-max.ll index 63d5632b50821..8914406f1db73 100644 --- a/llvm/test/CodeGen/AArch64/min-max.ll +++ b/llvm/test/CodeGen/AArch64/min-max.ll @@ -428,6 +428,36 @@ define i8 @umaxi8(i8 %a, i8 %b) { ret i8 %c } +define i8 @umaxi8_1(i8 %a) { +; CHECK-ISEL-LABEL: umaxi8_1: +; CHECK-ISEL: // %bb.0: +; CHECK-ISEL-NEXT: and w8, w0, #0xff +; CHECK-ISEL-NEXT: tst w0, #0xfe +; CHECK-ISEL-NEXT: csinc w0, w8, wzr, ne +; CHECK-ISEL-NEXT: ret +; +; CHECK-ISEL-CSSC-LABEL: umaxi8_1: +; CHECK-ISEL-CSSC: // %bb.0: +; CHECK-ISEL-CSSC-NEXT: and w8, w0, #0xff +; CHECK-ISEL-CSSC-NEXT: umax w0, w8, #1 +; CHECK-ISEL-CSSC-NEXT: ret +; +; CHECK-GLOBAL-LABEL: umaxi8_1: +; CHECK-GLOBAL: // %bb.0: +; CHECK-GLOBAL-NEXT: and w8, w0, #0xff +; CHECK-GLOBAL-NEXT: cmp w8, #1 +; CHECK-GLOBAL-NEXT: csinc w0, w0, wzr, hi +; CHECK-GLOBAL-NEXT: ret +; +; CHECK-GLOBAL-CSSC-LABEL: umaxi8_1: +; CHECK-GLOBAL-CSSC: // %bb.0: +; CHECK-GLOBAL-CSSC-NEXT: and w8, w0, #0xff +; CHECK-GLOBAL-CSSC-NEXT: umax w0, w8, #1 +; CHECK-GLOBAL-CSSC-NEXT: ret + %c = call i8 @llvm.umax.i8(i8 %a, i8 1) + ret i8 %c +} + declare i16 @llvm.umax.i16(i16 %a, i16 %b) readnone define i16 @umaxi16(i16 %a, i16 %b) { @@ -463,6 +493,36 @@ define i16 @umaxi16(i16 %a, i16 %b) { ret i16 %c } +define i16 @umaxi16_1(i16 %a) { +; CHECK-ISEL-LABEL: umaxi16_1: +; CHECK-ISEL: // %bb.0: +; CHECK-ISEL-NEXT: and w8, w0, #0xffff +; CHECK-ISEL-NEXT: tst w0, #0xfffe +; CHECK-ISEL-NEXT: csinc w0, w8, wzr, ne +; CHECK-ISEL-NEXT: ret +; +; CHECK-ISEL-CSSC-LABEL: umaxi16_1: +; CHECK-ISEL-CSSC: // %bb.0: +; CHECK-ISEL-CSSC-NEXT: and w8, w0, #0xffff +; CHECK-ISEL-CSSC-NEXT: umax w0, w8, #1 +; CHECK-ISEL-CSSC-NEXT: ret 
+; +; CHECK-GLOBAL-LABEL: umaxi16_1: +; CHECK-GLOBAL: // %bb.0: +; CHECK-GLOBAL-NEXT: and w8, w0, #0xffff +; CHECK-GLOBAL-NEXT: cmp w8, #1 +; CHECK-GLOBAL-NEXT: csinc w0, w0, wzr, hi +; CHECK-GLOBAL-NEXT: ret +; +; CHECK-GLOBAL-CSSC-LABEL: umaxi16_1: +; CHECK-GLOBAL-CSSC: // %bb.0: +; CHECK-GLOBAL-CSSC-NEXT: and w8, w0, #0xffff +; CHECK-GLOBAL-CSSC-NEXT: umax w0, w8, #1 +; CHECK-GLOBAL-CSSC-NEXT: ret + %c = call i16 @llvm.umax.i16(i16 %a, i16 1) + ret i16 %c +} + declare i32 @llvm.umax.i32(i32 %a, i32 %b) readnone define i32 @umaxi32(i32 %a, i32 %b) { @@ -491,6 +551,32 @@ define i32 @umaxi32(i32 %a, i32 %b) { ret i32 %c } +define i32 @umaxi32_1(i32 %a) { +; CHECK-ISEL-LABEL: umaxi32_1: +; CHECK-ISEL: // %bb.0: +; CHECK-ISEL-NEXT: cmp w0, #1 +; CHECK-ISEL-NEXT: csinc w0, w0, wzr, hi +; CHECK-ISEL-NEXT: ret +; +; CHECK-ISEL-CSSC-LABEL: umaxi32_1: +; CHECK-ISEL-CSSC: // %bb.0: +; CHECK-ISEL-CSSC-NEXT: umax w0, w0, #1 +; CHECK-ISEL-CSSC-NEXT: ret +; +; CHECK-GLOBAL-LABEL: umaxi32_1: +; CHECK-GLOBAL: // %bb.0: +; CHECK-GLOBAL-NEXT: cmp w0, #1 +; CHECK-GLOBAL-NEXT: csinc w0, w0, wzr, hi +; CHECK-GLOBAL-NEXT: ret +; +; CHECK-GLOBAL-CSSC-LABEL: umaxi32_1: +; CHECK-GLOBAL-CSSC: // %bb.0: +; CHECK-GLOBAL-CSSC-NEXT: umax w0, w0, #1 +; CHECK-GLOBAL-CSSC-NEXT: ret + %c = call i32 @llvm.umax.i32(i32 %a, i32 1) + ret i32 %c +} + declare i64 @llvm.umax.i64(i64 %a, i64 %b) readnone define i64 @umaxi64(i64 %a, i64 %b) { @@ -519,6 +605,32 @@ define i64 @umaxi64(i64 %a, i64 %b) { ret i64 %c } +define i64 @umaxi64_1(i64 %a) { +; CHECK-ISEL-LABEL: umaxi64_1: +; CHECK-ISEL: // %bb.0: +; CHECK-ISEL-NEXT: cmp x0, #1 +; CHECK-ISEL-NEXT: csinc x0, x0, xzr, hi +; CHECK-ISEL-NEXT: ret +; +; CHECK-ISEL-CSSC-LABEL: umaxi64_1: +; CHECK-ISEL-CSSC: // %bb.0: +; CHECK-ISEL-CSSC-NEXT: umax x0, x0, #1 +; CHECK-ISEL-CSSC-NEXT: ret +; +; CHECK-GLOBAL-LABEL: umaxi64_1: +; CHECK-GLOBAL: // %bb.0: +; CHECK-GLOBAL-NEXT: cmp x0, #1 +; CHECK-GLOBAL-NEXT: csinc x0, x0, xzr, hi +; CHECK-GLOBAL-NEXT: ret +; +; CHECK-GLOBAL-CSSC-LABEL: umaxi64_1: +; CHECK-GLOBAL-CSSC: // %bb.0: +; CHECK-GLOBAL-CSSC-NEXT: umax x0, x0, #1 +; CHECK-GLOBAL-CSSC-NEXT: ret + %c = call i64 @llvm.umax.i64(i64 %a, i64 1) + ret i64 %c +} + declare <8 x i8> @llvm.umax.v8i8(<8 x i8> %a, <8 x i8> %b) readnone define <8 x i8> @umax8i8(<8 x i8> %a, <8 x i8> %b) { From cc86e6b0a88be33a8b74b5f0431424909dec9feb Mon Sep 17 00:00:00 2001 From: Xiang1 Zhang Date: Tue, 21 Mar 2023 17:33:54 +0800 Subject: [PATCH 202/208] [BugFix] Fix VSELECT ISel fail Reviewed By: Luo yuanke Differential Revision: https://reviews.llvm.org/D146683 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 15 ++++++++---- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 2 ++ llvm/test/CodeGen/X86/vselect-post-combine.ll | 24 +++++++++++++++++++ 3 files changed, 36 insertions(+), 5 deletions(-) create mode 100644 llvm/test/CodeGen/X86/vselect-post-combine.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index cc722bcc8c2b3..20d7447802c8a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -12210,7 +12210,8 @@ static bool isCompatibleLoad(SDValue N, unsigned ExtOpcode) { /// This function is called by the DAGCombiner when visiting sext/zext/aext /// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND). 
static SDValue tryToFoldExtendSelectLoad(SDNode *N, const TargetLowering &TLI, - SelectionDAG &DAG) { + SelectionDAG &DAG, + CombineLevel Level) { unsigned Opcode = N->getOpcode(); SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); @@ -12235,10 +12236,14 @@ static SDValue tryToFoldExtendSelectLoad(SDNode *N, const TargetLowering &TLI, else if (Opcode == ISD::ZERO_EXTEND) ExtLoadOpcode = ISD::ZEXTLOAD; + // Illegal VSELECT may ISel fail if happen after legalization (DAG + // Combine2), so we should conservatively check the OperationAction. LoadSDNode *Load1 = cast(Op1); LoadSDNode *Load2 = cast(Op2); if (!TLI.isLoadExtLegal(ExtLoadOpcode, VT, Load1->getMemoryVT()) || - !TLI.isLoadExtLegal(ExtLoadOpcode, VT, Load2->getMemoryVT())) + !TLI.isLoadExtLegal(ExtLoadOpcode, VT, Load2->getMemoryVT()) || + (N0->getOpcode() == ISD::VSELECT && Level >= AfterLegalizeTypes && + TLI.getOperationAction(ISD::VSELECT, VT) != TargetLowering::Legal)) return SDValue(); SDValue Ext1 = DAG.getNode(Opcode, DL, VT, Op1); @@ -13106,7 +13111,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { return DAG.getNode(ISD::ADD, DL, VT, Zext, DAG.getAllOnesConstant(DL, VT)); } - if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG)) + if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, Level)) return Res; return SDValue(); @@ -13457,7 +13462,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { if (SDValue V = widenAbs(N, DAG)) return V; - if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG)) + if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, Level)) return Res; return SDValue(); @@ -13618,7 +13623,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { if (SDValue NewCtPop = widenCtPop(N, DAG)) return NewCtPop; - if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG)) + if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, Level)) return Res; return SDValue(); diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 5e90a94819b6b..dfac24935e244 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -1039,6 +1039,8 @@ void X86DAGToDAGISel::PreprocessISelDAG() { break; assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!"); + assert(N->getValueType(0).getVectorElementType() != MVT::i16 && + "We can't replace VSELECT with BLENDV in vXi16!"); SDValue Blendv = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), N->getOperand(0), N->getOperand(1), N->getOperand(2)); diff --git a/llvm/test/CodeGen/X86/vselect-post-combine.ll b/llvm/test/CodeGen/X86/vselect-post-combine.ll new file mode 100644 index 0000000000000..fdbc361e85d22 --- /dev/null +++ b/llvm/test/CodeGen/X86/vselect-post-combine.ll @@ -0,0 +1,24 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 + +define ptr @test_mul(ptr %addr) { +; AVX2-LABEL: test_mul: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpblendvb %xmm0, (%rdi), %xmm1, %xmm0 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vmovdqu %ymm0, 0 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: 
retq +entry: + %vec0 = load <32 x i8>, ptr %addr + %vec1 = shufflevector <32 x i8> %vec0, <32 x i8> , <32 x i32> + %0 = bitcast <32 x i8> %vec1 to <4 x i64> + %shuffle = shufflevector <4 x i64> %0, <4 x i64> zeroinitializer, <2 x i32> + %1 = bitcast <2 x i64> %shuffle to <16 x i8> + %conv = zext <16 x i8> %1 to <16 x i16> + store <16 x i16> %conv, ptr null, align 1 + ret ptr null +} From abddb8359895a2040a3439850f5c8c9c61123947 Mon Sep 17 00:00:00 2001 From: Dave Lee Date: Thu, 23 Mar 2023 22:14:10 -0700 Subject: [PATCH 203/208] [lldb] Fix type of --apply-fixits (NFC) --- lldb/source/Commands/Options.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/source/Commands/Options.td b/lldb/source/Commands/Options.td index f11c95e5660e2..ea917f78841bb 100644 --- a/lldb/source/Commands/Options.td +++ b/lldb/source/Commands/Options.td @@ -371,7 +371,7 @@ let Command = "expression" in { Arg<"Language">, Desc<"Specifies the Language to use when parsing the " "expression. If not set the target.language setting is used.">; def expression_options_apply_fixits : Option<"apply-fixits", "X">, - Groups<[1,2]>, Arg<"Language">, Desc<"If true, simple fix-it hints will be " + Groups<[1,2]>, Arg<"Boolean">, Desc<"If true, simple fix-it hints will be " "automatically applied to the expression.">; def expression_options_description_verbosity : Option<"description-verbosity", "v">, Group<1>, From 1e4325f30c2494637626a978c54b41c8ca7ec0ff Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 23 Mar 2023 23:48:17 -0700 Subject: [PATCH 204/208] [X86] Precommit a test This patch precommits a test for: https://github.com/llvm/llvm-project/issues/61365 --- llvm/test/CodeGen/X86/select-constant-lea.ll | 26 ++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 llvm/test/CodeGen/X86/select-constant-lea.ll diff --git a/llvm/test/CodeGen/X86/select-constant-lea.ll b/llvm/test/CodeGen/X86/select-constant-lea.ll new file mode 100644 index 0000000000000..e8472053353cc --- /dev/null +++ b/llvm/test/CodeGen/X86/select-constant-lea.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=BASE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=slow-3ops-lea | FileCheck %s --check-prefix=SLOWLEA3 + +define i32 @select_unsigned_lt_10_8_13j(i32 %0) { +; BASE-LABEL: select_unsigned_lt_10_8_13j: +; BASE: # %bb.0: +; BASE-NEXT: xorl %eax, %eax +; BASE-NEXT: cmpl $10, %edi +; BASE-NEXT: setae %al +; BASE-NEXT: leal (%rax,%rax,4), %eax +; BASE-NEXT: orl $8, %eax +; BASE-NEXT: retq +; +; SLOWLEA3-LABEL: select_unsigned_lt_10_8_13j: +; SLOWLEA3: # %bb.0: +; SLOWLEA3-NEXT: xorl %eax, %eax +; SLOWLEA3-NEXT: cmpl $10, %edi +; SLOWLEA3-NEXT: setae %al +; SLOWLEA3-NEXT: leal (%rax,%rax,4), %eax +; SLOWLEA3-NEXT: orl $8, %eax +; SLOWLEA3-NEXT: retq + %2 = icmp ult i32 %0, 10 + %3 = select i1 %2, i32 8, i32 13 + ret i32 %3 +} From d30bc9e91241d69410fe1a878a66438dd752014f Mon Sep 17 00:00:00 2001 From: Michael Platings Date: Tue, 14 Mar 2023 19:40:58 +0000 Subject: [PATCH 205/208] [Driver] Change multilib selection algorithm The new algorithm is: 1. Find all multilibs with flags that are a subset of the requested flags. 2. If more than one multilib matches, choose the last. In addition a new selection mechanism is permitted via an overload of MultilibSet::select() for which multiple multilibs are returned. This allows layering multilibs on top of each other. 
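
The subset rule can be sketched outside the driver as follows (a standalone illustration only; MiniMultilib and selectCompatible are invented names, not the actual clang Driver API):

  #include <set>
  #include <string>
  #include <vector>

  // A multilib requires a set of flags; it is compatible when every required
  // flag appears among the flags requested on the command line.
  struct MiniMultilib {
    std::string Suffix;
    std::vector<std::string> Flags;
  };

  // Return all compatible multilibs in declaration order.
  std::vector<MiniMultilib>
  selectCompatible(const std::vector<MiniMultilib> &All,
                   const std::set<std::string> &Requested) {
    std::vector<MiniMultilib> Result;
    for (const MiniMultilib &M : All) {
      bool AllPresent = true;
      for (const std::string &F : M.Flags)
        if (!Requested.count(F)) {
          AllPresent = false;
          break;
        }
      if (AllPresent)
        Result.push_back(M);
    }
    return Result;
  }

A caller that wants a single variant takes the last element of the returned list (last match wins), while a caller that layers variants walks the whole list in order.
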
Since multilibs are now ordered within a list, they no longer need a Priority field. The new algorithm is different to the old algorithm, but in practise the old algorithm was always used in such a way that the effect is the same. The old algorithm was to find the set intersection of the requested flags (with the first character of each removed) with each multilib's flags (ditto), and for that intersection check whether the first character matched. However, ignoring the first characters, the requested flags were always a superset of all the multilibs flags. Therefore the new algorithm can be used as a drop-in replacement. The exception is Fuchsia, which needs adjusting slightly to set both fexceptions and fno-exceptions flags. Differential Revision: https://reviews.llvm.org/D142905 --- clang/include/clang/Driver/Multilib.h | 17 +---- clang/include/clang/Driver/MultilibBuilder.h | 7 +- clang/lib/Driver/Multilib.cpp | 80 ++++++-------------- clang/lib/Driver/MultilibBuilder.cpp | 7 +- clang/lib/Driver/ToolChains/Fuchsia.cpp | 19 ++--- clang/lib/Driver/ToolChains/OHOS.cpp | 14 ++-- clang/unittests/Driver/MultilibTest.cpp | 37 +++++++-- 7 files changed, 79 insertions(+), 102 deletions(-) diff --git a/clang/include/clang/Driver/Multilib.h b/clang/include/clang/Driver/Multilib.h index feb12f3638d34..9d6f1d23696b8 100644 --- a/clang/include/clang/Driver/Multilib.h +++ b/clang/include/clang/Driver/Multilib.h @@ -36,14 +36,13 @@ class Multilib { std::string OSSuffix; std::string IncludeSuffix; flags_list Flags; - int Priority; public: /// GCCSuffix, OSSuffix & IncludeSuffix will be appended directly to the /// sysroot string so they must either be empty or begin with a '/' character. /// This is enforced with an assert in the constructor. Multilib(StringRef GCCSuffix = {}, StringRef OSSuffix = {}, - StringRef IncludeSuffix = {}, int Priority = 0, + StringRef IncludeSuffix = {}, const flags_list &Flags = flags_list()); /// Get the detected GCC installation path suffix for the multi-arch @@ -62,10 +61,6 @@ class Multilib { /// All elements begin with either '+' or '-' const flags_list &flags() const { return Flags; } - /// Returns the multilib priority. When more than one multilib matches flags, - /// the one with the highest priority is selected, with 0 being the default. 
- int priority() const { return Priority; } - LLVM_DUMP_METHOD void dump() const; /// print summary of the Multilib void print(raw_ostream &OS) const; @@ -108,6 +103,9 @@ class MultilibSet { const_iterator begin() const { return Multilibs.begin(); } const_iterator end() const { return Multilibs.end(); } + /// Select compatible variants + multilib_list select(const Multilib::flags_list &Flags) const; + /// Pick the best multilib in the set, \returns false if none are compatible bool select(const Multilib::flags_list &Flags, Multilib &M) const; @@ -129,13 +127,6 @@ class MultilibSet { } const IncludeDirsFunc &filePathsCallback() const { return FilePathsCallback; } - -private: - /// Apply the filter to Multilibs and return the subset that remains - static multilib_list filterCopy(FilterCallback F, const multilib_list &Ms); - - /// Apply the filter to the multilib_list, removing those that don't match - static void filterInPlace(FilterCallback F, multilib_list &Ms); }; raw_ostream &operator<<(raw_ostream &OS, const MultilibSet &MS); diff --git a/clang/include/clang/Driver/MultilibBuilder.h b/clang/include/clang/Driver/MultilibBuilder.h index cf84c456152b1..f4875f2e03f8a 100644 --- a/clang/include/clang/Driver/MultilibBuilder.h +++ b/clang/include/clang/Driver/MultilibBuilder.h @@ -28,11 +28,10 @@ class MultilibBuilder { std::string OSSuffix; std::string IncludeSuffix; flags_list Flags; - int Priority; public: MultilibBuilder(StringRef GCCSuffix, StringRef OSSuffix, - StringRef IncludeSuffix, int Priority = 0); + StringRef IncludeSuffix); /// Initializes GCCSuffix, OSSuffix & IncludeSuffix to the same value. MultilibBuilder(StringRef Suffix = {}); @@ -75,10 +74,6 @@ class MultilibBuilder { const flags_list &flags() const { return Flags; } flags_list &flags() { return Flags; } - /// Returns the multilib priority. When more than one multilib matches flags, - /// the one with the highest priority is selected, with 0 being the default. 
- int priority() const { return Priority; } - /// Add a flag to the flags list /// \p Flag must be a flag accepted by the driver with its leading '-' /// removed, diff --git a/clang/lib/Driver/Multilib.cpp b/clang/lib/Driver/Multilib.cpp index d1ab0c7b114e9..06bab74898616 100644 --- a/clang/lib/Driver/Multilib.cpp +++ b/clang/lib/Driver/Multilib.cpp @@ -26,10 +26,9 @@ using namespace driver; using namespace llvm::sys; Multilib::Multilib(StringRef GCCSuffix, StringRef OSSuffix, - StringRef IncludeSuffix, int Priority, - const flags_list &Flags) + StringRef IncludeSuffix, const flags_list &Flags) : GCCSuffix(GCCSuffix), OSSuffix(OSSuffix), IncludeSuffix(IncludeSuffix), - Flags(Flags), Priority(Priority) { + Flags(Flags) { assert(GCCSuffix.empty() || (StringRef(GCCSuffix).front() == '/' && GCCSuffix.size() > 1)); assert(OSSuffix.empty() || @@ -84,56 +83,36 @@ raw_ostream &clang::driver::operator<<(raw_ostream &OS, const Multilib &M) { } MultilibSet &MultilibSet::FilterOut(FilterCallback F) { - filterInPlace(F, Multilibs); + llvm::erase_if(Multilibs, F); return *this; } void MultilibSet::push_back(const Multilib &M) { Multilibs.push_back(M); } -static bool isFlagEnabled(StringRef Flag) { - char Indicator = Flag.front(); - assert(Indicator == '+' || Indicator == '-'); - return Indicator == '+'; +MultilibSet::multilib_list +MultilibSet::select(const Multilib::flags_list &Flags) const { + llvm::StringSet<> FlagSet; + for (const auto &Flag : Flags) + FlagSet.insert(Flag); + + multilib_list Result; + llvm::copy_if(Multilibs, std::back_inserter(Result), + [&FlagSet](const Multilib &M) { + for (const std::string &F : M.flags()) + if (!FlagSet.contains(F)) + return false; + return true; + }); + return Result; } -bool MultilibSet::select(const Multilib::flags_list &Flags, Multilib &M) const { - llvm::StringMap FlagSet; - - // Stuff all of the flags into the FlagSet such that a true mappend indicates - // the flag was enabled, and a false mappend indicates the flag was disabled. - for (StringRef Flag : Flags) - FlagSet[Flag.substr(1)] = isFlagEnabled(Flag); - - multilib_list Filtered = filterCopy([&FlagSet](const Multilib &M) { - for (StringRef Flag : M.flags()) { - llvm::StringMap::const_iterator SI = FlagSet.find(Flag.substr(1)); - if (SI != FlagSet.end()) - if (SI->getValue() != isFlagEnabled(Flag)) - return true; - } - return false; - }, Multilibs); - - if (Filtered.empty()) +bool MultilibSet::select(const Multilib::flags_list &Flags, + Multilib &Selected) const { + multilib_list Result = select(Flags); + if (Result.empty()) return false; - if (Filtered.size() == 1) { - M = Filtered[0]; - return true; - } - - // Sort multilibs by priority and select the one with the highest priority. - llvm::sort(Filtered, [](const Multilib &a, const Multilib &b) -> bool { - return a.priority() > b.priority(); - }); - - if (Filtered[0].priority() > Filtered[1].priority()) { - M = Filtered[0]; - return true; - } - - // TODO: We should consider returning llvm::Error rather than aborting. 
- assert(false && "More than one multilib with the same priority"); - return false; + Selected = Result.back(); + return true; } LLVM_DUMP_METHOD void MultilibSet::dump() const { @@ -145,17 +124,6 @@ void MultilibSet::print(raw_ostream &OS) const { OS << M << "\n"; } -MultilibSet::multilib_list MultilibSet::filterCopy(FilterCallback F, - const multilib_list &Ms) { - multilib_list Copy(Ms); - filterInPlace(F, Copy); - return Copy; -} - -void MultilibSet::filterInPlace(FilterCallback F, multilib_list &Ms) { - llvm::erase_if(Ms, F); -} - raw_ostream &clang::driver::operator<<(raw_ostream &OS, const MultilibSet &MS) { MS.print(OS); return OS; diff --git a/clang/lib/Driver/MultilibBuilder.cpp b/clang/lib/Driver/MultilibBuilder.cpp index 83ebc31d8eb99..f6351ae4b5278 100644 --- a/clang/lib/Driver/MultilibBuilder.cpp +++ b/clang/lib/Driver/MultilibBuilder.cpp @@ -41,9 +41,8 @@ static void normalizePathSegment(std::string &Segment) { } } -MultilibBuilder::MultilibBuilder(StringRef GCC, StringRef OS, StringRef Include, - int Priority) - : GCCSuffix(GCC), OSSuffix(OS), IncludeSuffix(Include), Priority(Priority) { +MultilibBuilder::MultilibBuilder(StringRef GCC, StringRef OS, StringRef Include) + : GCCSuffix(GCC), OSSuffix(OS), IncludeSuffix(Include) { normalizePathSegment(GCCSuffix); normalizePathSegment(OSSuffix); normalizePathSegment(IncludeSuffix); @@ -87,7 +86,7 @@ bool MultilibBuilder::isValid() const { } Multilib MultilibBuilder::makeMultilib() const { - return Multilib(GCCSuffix, OSSuffix, IncludeSuffix, Priority, Flags); + return Multilib(GCCSuffix, OSSuffix, IncludeSuffix, Flags); } MultilibSetBuilder &MultilibSetBuilder::Maybe(const MultilibBuilder &M) { diff --git a/clang/lib/Driver/ToolChains/Fuchsia.cpp b/clang/lib/Driver/ToolChains/Fuchsia.cpp index 3a3f7043a795f..b8bb000391b91 100644 --- a/clang/lib/Driver/ToolChains/Fuchsia.cpp +++ b/clang/lib/Driver/ToolChains/Fuchsia.cpp @@ -263,33 +263,33 @@ Fuchsia::Fuchsia(const Driver &D, const llvm::Triple &Triple, Multilibs.push_back(Multilib()); // Use the noexcept variant with -fno-exceptions to avoid the extra overhead. - Multilibs.push_back(MultilibBuilder("noexcept", {}, {}, 1) + Multilibs.push_back(MultilibBuilder("noexcept", {}, {}) .flag("-fexceptions") .flag("+fno-exceptions") .makeMultilib()); // ASan has higher priority because we always want the instrumentated version. - Multilibs.push_back(MultilibBuilder("asan", {}, {}, 2) + Multilibs.push_back(MultilibBuilder("asan", {}, {}) .flag("+fsanitize=address") .makeMultilib()); // Use the asan+noexcept variant with ASan and -fno-exceptions. - Multilibs.push_back(MultilibBuilder("asan+noexcept", {}, {}, 3) + Multilibs.push_back(MultilibBuilder("asan+noexcept", {}, {}) .flag("+fsanitize=address") .flag("-fexceptions") .flag("+fno-exceptions") .makeMultilib()); // HWASan has higher priority because we always want the instrumentated // version. - Multilibs.push_back(MultilibBuilder("hwasan", {}, {}, 4) + Multilibs.push_back(MultilibBuilder("hwasan", {}, {}) .flag("+fsanitize=hwaddress") .makeMultilib()); // Use the hwasan+noexcept variant with HWASan and -fno-exceptions. - Multilibs.push_back(MultilibBuilder("hwasan+noexcept", {}, {}, 5) + Multilibs.push_back(MultilibBuilder("hwasan+noexcept", {}, {}) .flag("+fsanitize=hwaddress") .flag("-fexceptions") .flag("+fno-exceptions") .makeMultilib()); // Use Itanium C++ ABI for the compat multilib. 
- Multilibs.push_back(MultilibBuilder("compat", {}, {}, 6) + Multilibs.push_back(MultilibBuilder("compat", {}, {}) .flag("+fc++-abi=itanium") .makeMultilib()); @@ -299,9 +299,10 @@ Fuchsia::Fuchsia(const Driver &D, const llvm::Triple &Triple, }); Multilib::flags_list Flags; - addMultilibFlag( - Args.hasFlag(options::OPT_fexceptions, options::OPT_fno_exceptions, true), - "fexceptions", Flags); + bool Exceptions = + Args.hasFlag(options::OPT_fexceptions, options::OPT_fno_exceptions, true); + addMultilibFlag(Exceptions, "fexceptions", Flags); + addMultilibFlag(!Exceptions, "fno-exceptions", Flags); addMultilibFlag(getSanitizerArgs(Args).needsAsanRt(), "fsanitize=address", Flags); addMultilibFlag(getSanitizerArgs(Args).needsHwasanRt(), "fsanitize=hwaddress", diff --git a/clang/lib/Driver/ToolChains/OHOS.cpp b/clang/lib/Driver/ToolChains/OHOS.cpp index 71a4ccd042ac8..bd0409d282084 100644 --- a/clang/lib/Driver/ToolChains/OHOS.cpp +++ b/clang/lib/Driver/ToolChains/OHOS.cpp @@ -39,14 +39,16 @@ static bool findOHOSMuslMultilibs(const Multilib::flags_list &Flags, // -mcpu=cortex-a7 // -mfloat-abi=soft -mfloat-abi=softfp -mfloat-abi=hard // -mfpu=neon-vfpv4 - Multilibs.push_back(Multilib("/a7_soft", {}, {}, 1, - {"+mcpu=cortex-a7", "+mfloat-abi=soft"})); + Multilibs.push_back( + Multilib("/a7_soft", {}, {}, {"+mcpu=cortex-a7", "+mfloat-abi=soft"})); - Multilibs.push_back(Multilib("/a7_softfp_neon-vfpv4", {}, {}, 1, - {"+mcpu=cortex-a7", "+mfloat-abi=softfp", "+mfpu=neon-vfpv4"})); + Multilibs.push_back( + Multilib("/a7_softfp_neon-vfpv4", {}, {}, + {"+mcpu=cortex-a7", "+mfloat-abi=softfp", "+mfpu=neon-vfpv4"})); - Multilibs.push_back(Multilib("/a7_hard_neon-vfpv4", {}, {}, 1, - {"+mcpu=cortex-a7", "+mfloat-abi=hard", "+mfpu=neon-vfpv4"})); + Multilibs.push_back( + Multilib("/a7_hard_neon-vfpv4", {}, {}, + {"+mcpu=cortex-a7", "+mfloat-abi=hard", "+mfpu=neon-vfpv4"})); if (Multilibs.select(Flags, Result.SelectedMultilib)) { Result.Multilibs = Multilibs; diff --git a/clang/unittests/Driver/MultilibTest.cpp b/clang/unittests/Driver/MultilibTest.cpp index 2e729a5051734..6a066f6b0f5a6 100644 --- a/clang/unittests/Driver/MultilibTest.cpp +++ b/clang/unittests/Driver/MultilibTest.cpp @@ -33,14 +33,14 @@ TEST(MultilibTest, OpEqReflexivity2) { } TEST(MultilibTest, OpEqReflexivity3) { - Multilib M1({}, {}, {}, 0, {"+foo"}); - Multilib M2({}, {}, {}, 0, {"+foo"}); + Multilib M1({}, {}, {}, {"+foo"}); + Multilib M2({}, {}, {}, {"+foo"}); ASSERT_TRUE(M1 == M2) << "Multilibs with the same flag should be the same"; } TEST(MultilibTest, OpEqInequivalence1) { - Multilib M1({}, {}, {}, 0, {"+foo"}); - Multilib M2({}, {}, {}, 0, {"-foo"}); + Multilib M1({}, {}, {}, {"+foo"}); + Multilib M2({}, {}, {}, {"-foo"}); ASSERT_FALSE(M1 == M2) << "Multilibs with conflicting flags are not the same"; ASSERT_FALSE(M2 == M1) << "Multilibs with conflicting flags are not the same (commuted)"; @@ -48,7 +48,7 @@ TEST(MultilibTest, OpEqInequivalence1) { TEST(MultilibTest, OpEqInequivalence2) { Multilib M1; - Multilib M2({}, {}, {}, 0, {"+foo"}); + Multilib M2({}, {}, {}, {"+foo"}); ASSERT_FALSE(M1 == M2) << "Flags make Multilibs different"; } @@ -124,7 +124,7 @@ TEST(MultilibTest, Construction2) { } TEST(MultilibTest, Construction3) { - Multilib M({}, {}, {}, 0, {"+f1", "+f2", "-f3"}); + Multilib M({}, {}, {}, {"+f1", "+f2", "-f3"}); for (Multilib::flags_list::const_iterator I = M.flags().begin(), E = M.flags().end(); I != E; ++I) { @@ -149,8 +149,8 @@ TEST(MultilibTest, SetPushback) { TEST(MultilibTest, SetPriority) { MultilibSet MS({ 
- Multilib("/foo", {}, {}, 1, {"+foo"}), - Multilib("/bar", {}, {}, 2, {"+bar"}), + Multilib("/foo", {}, {}, {"+foo"}), + Multilib("/bar", {}, {}, {"+bar"}), }); Multilib::flags_list Flags1 = {"+foo", "-bar"}; Multilib Selection1; @@ -166,3 +166,24 @@ TEST(MultilibTest, SetPriority) { ASSERT_TRUE(Selection2.gccSuffix() == "/bar") << "Selection picked " << Selection2 << " which was not expected"; } + +TEST(MultilibTest, SelectMultiple) { + MultilibSet MS({ + Multilib("/a", {}, {}, {"x"}), + Multilib("/b", {}, {}, {"y"}), + }); + std::vector Selection; + + Selection = MS.select({"x"}); + ASSERT_EQ(1u, Selection.size()); + EXPECT_EQ("/a", Selection[0].gccSuffix()); + + Selection = MS.select({"y"}); + ASSERT_EQ(1u, Selection.size()); + EXPECT_EQ("/b", Selection[0].gccSuffix()); + + Selection = MS.select({"y", "x"}); + ASSERT_EQ(2u, Selection.size()); + EXPECT_EQ("/a", Selection[0].gccSuffix()); + EXPECT_EQ("/b", Selection[1].gccSuffix()); +} From f957b8fe1efe34ac04d1b2e6381e44edcef056b3 Mon Sep 17 00:00:00 2001 From: Carlos Galvez Date: Thu, 23 Mar 2023 12:16:40 +0000 Subject: [PATCH 206/208] [clang-tidy][NFC] Improve naming convention in google-readability-avoid-underscore-in-googletest-name According to the Google docs, the convention is TEST(TestSuiteName, TestName). Apply that convention to the source code, test and documentation of the check. Differential Revision: https://reviews.llvm.org/D146713 --- .../AvoidUnderscoreInGoogletestNameCheck.cpp | 19 +- ...ty-avoid-underscore-in-googletest-name.rst | 20 +- .../avoid-underscore-in-googletest-name.cpp | 226 +++++++++--------- 3 files changed, 133 insertions(+), 132 deletions(-) diff --git a/clang-tools-extra/clang-tidy/google/AvoidUnderscoreInGoogletestNameCheck.cpp b/clang-tools-extra/clang-tidy/google/AvoidUnderscoreInGoogletestNameCheck.cpp index b903f2552b7e6..d522d6760af1d 100644 --- a/clang-tools-extra/clang-tidy/google/AvoidUnderscoreInGoogletestNameCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/AvoidUnderscoreInGoogletestNameCheck.cpp @@ -47,18 +47,19 @@ class AvoidUnderscoreInGoogletestNameCallback : public PPCallbacks { if (!isGoogletestTestMacro(MacroName) || !Args || Args->getNumMacroArguments() < 2) return; - const Token *TestCaseNameToken = Args->getUnexpArgument(0); + const Token *TestSuiteNameToken = Args->getUnexpArgument(0); const Token *TestNameToken = Args->getUnexpArgument(1); - if (!TestCaseNameToken || !TestNameToken) + if (!TestSuiteNameToken || !TestNameToken) return; - std::string TestCaseNameMaybeDisabled = PP->getSpelling(*TestCaseNameToken); - StringRef TestCaseName = TestCaseNameMaybeDisabled; - TestCaseName.consume_front(KDisabledTestPrefix); - if (TestCaseName.contains('_')) - Check->diag(TestCaseNameToken->getLocation(), - "avoid using \"_\" in test case name \"%0\" according to " + std::string TestSuiteNameMaybeDisabled = + PP->getSpelling(*TestSuiteNameToken); + StringRef TestSuiteName = TestSuiteNameMaybeDisabled; + TestSuiteName.consume_front(KDisabledTestPrefix); + if (TestSuiteName.contains('_')) + Check->diag(TestSuiteNameToken->getLocation(), + "avoid using \"_\" in test suite name \"%0\" according to " "Googletest FAQ") - << TestCaseName; + << TestSuiteName; std::string TestNameMaybeDisabled = PP->getSpelling(*TestNameToken); StringRef TestName = TestNameMaybeDisabled; diff --git a/clang-tools-extra/docs/clang-tidy/checks/google/readability-avoid-underscore-in-googletest-name.rst b/clang-tools-extra/docs/clang-tidy/checks/google/readability-avoid-underscore-in-googletest-name.rst index 
f2053b4d2fcd3..e667fd12222bb 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/google/readability-avoid-underscore-in-googletest-name.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/google/readability-avoid-underscore-in-googletest-name.rst @@ -3,8 +3,8 @@ google-readability-avoid-underscore-in-googletest-name ====================================================== -Checks whether there are underscores in googletest test and test case names in -test macros: +Checks whether there are underscores in googletest test suite names and test +names in test macros: - ``TEST`` - ``TEST_F`` @@ -18,17 +18,17 @@ For example: .. code-block:: c++ - TEST(TestCaseName, Illegal_TestName) {} - TEST(Illegal_TestCaseName, TestName) {} + TEST(TestSuiteName, Illegal_TestName) {} + TEST(Illegal_TestSuiteName, TestName) {} -would trigger the check. `Underscores are not allowed`_ in test names nor test -case names. +would trigger the check. `Underscores are not allowed`_ in test suite name nor +test names. -The ``DISABLED_`` prefix, which may be used to `disable individual tests`_, is -ignored when checking test names, but the rest of the rest of the test name is -still checked. +The ``DISABLED_`` prefix, which may be used to +`disable test suites and individual tests`_, is removed from the test suite name +and test name before checking for underscores. This check does not propose any fixes. .. _Underscores are not allowed: https://google.github.io/googletest/faq.html#why-should-test-suite-names-and-test-names-not-contain-underscore -.. _disable individual tests: https://google.github.io/googletest/advanced.html#temporarily-disabling-tests +.. _disable test suites and individual tests: https://google.github.io/googletest/advanced.html#temporarily-disabling-tests diff --git a/clang-tools-extra/test/clang-tidy/checkers/google/avoid-underscore-in-googletest-name.cpp b/clang-tools-extra/test/clang-tidy/checkers/google/avoid-underscore-in-googletest-name.cpp index 3ab5a6ffe383b..0e43735c2105c 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/google/avoid-underscore-in-googletest-name.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/google/avoid-underscore-in-googletest-name.cpp @@ -1,118 +1,118 @@ // RUN: %check_clang_tidy %s google-readability-avoid-underscore-in-googletest-name %t -#define TEST(test_case_name, test_name) void test_case_name##test_name() -#define TEST_F(test_case_name, test_name) void test_case_name##test_name() -#define TEST_P(test_case_name, test_name) void test_case_name##test_name() -#define TYPED_TEST(test_case_name, test_name) void test_case_name##test_name() -#define TYPED_TEST_P(test_case_name, test_name) void test_case_name##test_name() -#define FRIEND_TEST(test_case_name, test_name) void test_case_name##test_name() - -TEST(TestCaseName, Illegal_TestName) {} -// CHECK-MESSAGES: :[[@LINE-1]]:20: warning: avoid using "_" in test name "Illegal_TestName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] - -TEST(TestCaseName, DISABLED_Illegal_TestName) {} -// CHECK-MESSAGES: :[[@LINE-1]]:20: warning: avoid using "_" in test name "Illegal_TestName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] -TEST(TestCaseName, Illegal_Test_Name) {} -// CHECK-MESSAGES: :[[@LINE-1]]:20: warning: avoid using "_" in test name "Illegal_Test_Name" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] -TEST(Illegal_TestCaseName, TestName) {} -// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: avoid using "_" 
in test case name "Illegal_TestCaseName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] -TEST(Illegal_Test_CaseName, TestName) {} -// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: avoid using "_" in test case name "Illegal_Test_CaseName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] -TEST(Illegal_TestCaseName, Illegal_TestName) {} -// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: avoid using "_" in test case name "Illegal_TestCaseName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] -// CHECK-MESSAGES: :[[@LINE-2]]:28: warning: avoid using "_" in test name "Illegal_TestName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] - -TEST_F(TestCaseFixtureName, Illegal_TestName) {} -// CHECK-MESSAGES: :[[@LINE-1]]:29: warning: avoid using "_" in test name "Illegal_TestName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] -TEST_F(TestCaseFixtureName, DISABLED_Illegal_Test_Name) {} -// CHECK-MESSAGES: :[[@LINE-1]]:29: warning: avoid using "_" in test name "Illegal_Test_Name" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] -TEST_F(TestCaseFixtureName, Illegal_Test_Name) {} -// CHECK-MESSAGES: :[[@LINE-1]]:29: warning: avoid using "_" in test name "Illegal_Test_Name" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] - -TEST_F(Illegal_TestCaseFixtureName, TestName) {} -// CHECK-MESSAGES: :[[@LINE-1]]:8: warning: avoid using "_" in test case name "Illegal_TestCaseFixtureName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] -TEST_F(Illegal_TestCaseFixtureName, Illegal_TestName) {} -// CHECK-MESSAGES: :[[@LINE-1]]:8: warning: avoid using "_" in test case name "Illegal_TestCaseFixtureName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] -// CHECK-MESSAGES: :[[@LINE-2]]:37: warning: avoid using "_" in test name "Illegal_TestName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] - -TEST_F(Illegal_Test_CaseFixtureName, TestName) {} -// CHECK-MESSAGES: :[[@LINE-1]]:8: warning: avoid using "_" in test case name "Illegal_Test_CaseFixtureName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] - -TEST_P(ParameterizedTestCaseFixtureName, Illegal_TestName) {} -// CHECK-MESSAGES: :[[@LINE-1]]:42: warning: avoid using "_" in test name "Illegal_TestName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] -TEST_P(ParameterizedTestCaseFixtureName, DISABLED_Illegal_TestName) {} -// CHECK-MESSAGES: :[[@LINE-1]]:42: warning: avoid using "_" in test name "Illegal_TestName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] -TEST_P(ParameterizedTestCaseFixtureName, Illegal_Test_Name) {} -// CHECK-MESSAGES: :[[@LINE-1]]:42: warning: avoid using "_" in test name "Illegal_Test_Name" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] - -TEST_P(Illegal_ParameterizedTestCaseFixtureName, TestName) {} -// CHECK-MESSAGES: :[[@LINE-1]]:8: warning: avoid using "_" in test case name "Illegal_ParameterizedTestCaseFixtureName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] -TEST_P(Illegal_ParameterizedTestCaseFixtureName, Illegal_TestName) {} -// CHECK-MESSAGES: :[[@LINE-1]]:8: warning: avoid using "_" in test case 
name "Illegal_ParameterizedTestCaseFixtureName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] -// CHECK-MESSAGES: :[[@LINE-2]]:50: warning: avoid using "_" in test name "Illegal_TestName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] - -TEST_P(Illegal_Parameterized_TestCaseFixtureName, TestName) {} -// CHECK-MESSAGES: :[[@LINE-1]]:8: warning: avoid using "_" in test case name "Illegal_Parameterized_TestCaseFixtureName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] - -TYPED_TEST(TypedTestCaseName, Illegal_TestName) {} -// CHECK-MESSAGES: :[[@LINE-1]]:31: warning: avoid using "_" in test name "Illegal_TestName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] -TYPED_TEST(TypedTestCaseName, DISABLED_Illegal_TestName) {} -// CHECK-MESSAGES: :[[@LINE-1]]:31: warning: avoid using "_" in test name "Illegal_TestName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] -TYPED_TEST(TypedTestCaseName, Illegal_Test_Name) {} -// CHECK-MESSAGES: :[[@LINE-1]]:31: warning: avoid using "_" in test name "Illegal_Test_Name" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] - -TYPED_TEST(Illegal_TypedTestCaseName, TestName) {} -// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: avoid using "_" in test case name "Illegal_TypedTestCaseName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] -TYPED_TEST(Illegal_TypedTestCaseName, Illegal_TestName) {} -// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: avoid using "_" in test case name "Illegal_TypedTestCaseName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] -// CHECK-MESSAGES: :[[@LINE-2]]:39: warning: avoid using "_" in test name "Illegal_TestName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] - -TYPED_TEST(Illegal_Typed_TestCaseName, TestName) {} -// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: avoid using "_" in test case name "Illegal_Typed_TestCaseName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] - -TYPED_TEST_P(TypeParameterizedTestCaseName, Illegal_TestName) {} -// CHECK-MESSAGES: :[[@LINE-1]]:45: warning: avoid using "_" in test name "Illegal_TestName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] -TYPED_TEST_P(TypeParameterizedTestCaseName, DISABLED_Illegal_TestName) {} -// CHECK-MESSAGES: :[[@LINE-1]]:45: warning: avoid using "_" in test name "Illegal_TestName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] -TYPED_TEST_P(TypeParameterizedTestCaseName, Illegal_Test_Name) {} -// CHECK-MESSAGES: :[[@LINE-1]]:45: warning: avoid using "_" in test name "Illegal_Test_Name" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] - -TYPED_TEST_P(Illegal_TypeParameterizedTestCaseName, TestName) {} -// CHECK-MESSAGES: :[[@LINE-1]]:14: warning: avoid using "_" in test case name "Illegal_TypeParameterizedTestCaseName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] -TYPED_TEST_P(Illegal_TypeParameterizedTestCaseName, Illegal_TestName) {} -// CHECK-MESSAGES: :[[@LINE-1]]:14: warning: avoid using "_" in test case name "Illegal_TypeParameterizedTestCaseName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] -// CHECK-MESSAGES: 
:[[@LINE-2]]:53: warning: avoid using "_" in test name "Illegal_TestName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] - -TYPED_TEST_P(Illegal_Type_ParameterizedTestCaseName, TestName) {} -// CHECK-MESSAGES: :[[@LINE-1]]:14: warning: avoid using "_" in test case name "Illegal_Type_ParameterizedTestCaseName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] +#define TEST(test_suite_name, test_name) void test_suite_name##test_name() +#define TEST_F(test_suite_name, test_name) void test_suite_name##test_name() +#define TEST_P(test_suite_name, test_name) void test_suite_name##test_name() +#define TYPED_TEST(test_suite_name, test_name) void test_suite_name##test_name() +#define TYPED_TEST_P(test_suite_name, test_name) void test_suite_name##test_name() +#define FRIEND_TEST(test_suite_name, test_name) void test_suite_name##test_name() + +TEST(TestSuiteName, Illegal_TestName) {} +// CHECK-MESSAGES: :[[@LINE-1]]:21: warning: avoid using "_" in test name "Illegal_TestName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] + +TEST(TestSuiteName, DISABLED_Illegal_TestName) {} +// CHECK-MESSAGES: :[[@LINE-1]]:21: warning: avoid using "_" in test name "Illegal_TestName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] +TEST(TestSuiteName, Illegal_Test_Name) {} +// CHECK-MESSAGES: :[[@LINE-1]]:21: warning: avoid using "_" in test name "Illegal_Test_Name" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] +TEST(Illegal_TestSuiteName, TestName) {} +// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: avoid using "_" in test suite name "Illegal_TestSuiteName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] +TEST(Illegal_Test_SuiteName, TestName) {} +// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: avoid using "_" in test suite name "Illegal_Test_SuiteName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] +TEST(Illegal_TestSuiteName, Illegal_TestName) {} +// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: avoid using "_" in test suite name "Illegal_TestSuiteName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] +// CHECK-MESSAGES: :[[@LINE-2]]:29: warning: avoid using "_" in test name "Illegal_TestName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] + +TEST_F(TestSuiteFixtureName, Illegal_TestName) {} +// CHECK-MESSAGES: :[[@LINE-1]]:30: warning: avoid using "_" in test name "Illegal_TestName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] +TEST_F(TestSuiteFixtureName, DISABLED_Illegal_Test_Name) {} +// CHECK-MESSAGES: :[[@LINE-1]]:30: warning: avoid using "_" in test name "Illegal_Test_Name" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] +TEST_F(TestSuiteFixtureName, Illegal_Test_Name) {} +// CHECK-MESSAGES: :[[@LINE-1]]:30: warning: avoid using "_" in test name "Illegal_Test_Name" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] + +TEST_F(Illegal_TestSuiteFixtureName, TestName) {} +// CHECK-MESSAGES: :[[@LINE-1]]:8: warning: avoid using "_" in test suite name "Illegal_TestSuiteFixtureName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] +TEST_F(Illegal_TestSuiteFixtureName, Illegal_TestName) {} +// CHECK-MESSAGES: :[[@LINE-1]]:8: warning: avoid using "_" in 
test suite name "Illegal_TestSuiteFixtureName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] +// CHECK-MESSAGES: :[[@LINE-2]]:38: warning: avoid using "_" in test name "Illegal_TestName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] + +TEST_F(Illegal_Test_SuiteFixtureName, TestName) {} +// CHECK-MESSAGES: :[[@LINE-1]]:8: warning: avoid using "_" in test suite name "Illegal_Test_SuiteFixtureName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] + +TEST_P(ParameterizedTestSuiteFixtureName, Illegal_TestName) {} +// CHECK-MESSAGES: :[[@LINE-1]]:43: warning: avoid using "_" in test name "Illegal_TestName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] +TEST_P(ParameterizedTestSuiteFixtureName, DISABLED_Illegal_TestName) {} +// CHECK-MESSAGES: :[[@LINE-1]]:43: warning: avoid using "_" in test name "Illegal_TestName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] +TEST_P(ParameterizedTestSuiteFixtureName, Illegal_Test_Name) {} +// CHECK-MESSAGES: :[[@LINE-1]]:43: warning: avoid using "_" in test name "Illegal_Test_Name" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] + +TEST_P(Illegal_ParameterizedTestSuiteFixtureName, TestName) {} +// CHECK-MESSAGES: :[[@LINE-1]]:8: warning: avoid using "_" in test suite name "Illegal_ParameterizedTestSuiteFixtureName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] +TEST_P(Illegal_ParameterizedTestSuiteFixtureName, Illegal_TestName) {} +// CHECK-MESSAGES: :[[@LINE-1]]:8: warning: avoid using "_" in test suite name "Illegal_ParameterizedTestSuiteFixtureName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] +// CHECK-MESSAGES: :[[@LINE-2]]:51: warning: avoid using "_" in test name "Illegal_TestName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] + +TEST_P(Illegal_Parameterized_TestSuiteFixtureName, TestName) {} +// CHECK-MESSAGES: :[[@LINE-1]]:8: warning: avoid using "_" in test suite name "Illegal_Parameterized_TestSuiteFixtureName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] + +TYPED_TEST(TypedTestSuiteName, Illegal_TestName) {} +// CHECK-MESSAGES: :[[@LINE-1]]:32: warning: avoid using "_" in test name "Illegal_TestName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] +TYPED_TEST(TypedTestSuiteName, DISABLED_Illegal_TestName) {} +// CHECK-MESSAGES: :[[@LINE-1]]:32: warning: avoid using "_" in test name "Illegal_TestName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] +TYPED_TEST(TypedTestSuiteName, Illegal_Test_Name) {} +// CHECK-MESSAGES: :[[@LINE-1]]:32: warning: avoid using "_" in test name "Illegal_Test_Name" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] + +TYPED_TEST(Illegal_TypedTestSuiteName, TestName) {} +// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: avoid using "_" in test suite name "Illegal_TypedTestSuiteName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] +TYPED_TEST(Illegal_TypedTestSuiteName, Illegal_TestName) {} +// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: avoid using "_" in test suite name "Illegal_TypedTestSuiteName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] +// CHECK-MESSAGES: 
:[[@LINE-2]]:40: warning: avoid using "_" in test name "Illegal_TestName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] + +TYPED_TEST(Illegal_Typed_TestSuiteName, TestName) {} +// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: avoid using "_" in test suite name "Illegal_Typed_TestSuiteName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] + +TYPED_TEST_P(TypeParameterizedTestSuiteName, Illegal_TestName) {} +// CHECK-MESSAGES: :[[@LINE-1]]:46: warning: avoid using "_" in test name "Illegal_TestName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] +TYPED_TEST_P(TypeParameterizedTestSuiteName, DISABLED_Illegal_TestName) {} +// CHECK-MESSAGES: :[[@LINE-1]]:46: warning: avoid using "_" in test name "Illegal_TestName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] +TYPED_TEST_P(TypeParameterizedTestSuiteName, Illegal_Test_Name) {} +// CHECK-MESSAGES: :[[@LINE-1]]:46: warning: avoid using "_" in test name "Illegal_Test_Name" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] + +TYPED_TEST_P(Illegal_TypeParameterizedTestSuiteName, TestName) {} +// CHECK-MESSAGES: :[[@LINE-1]]:14: warning: avoid using "_" in test suite name "Illegal_TypeParameterizedTestSuiteName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] +TYPED_TEST_P(Illegal_TypeParameterizedTestSuiteName, Illegal_TestName) {} +// CHECK-MESSAGES: :[[@LINE-1]]:14: warning: avoid using "_" in test suite name "Illegal_TypeParameterizedTestSuiteName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] +// CHECK-MESSAGES: :[[@LINE-2]]:54: warning: avoid using "_" in test name "Illegal_TestName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] + +TYPED_TEST_P(Illegal_Type_ParameterizedTestSuiteName, TestName) {} +// CHECK-MESSAGES: :[[@LINE-1]]:14: warning: avoid using "_" in test suite name "Illegal_Type_ParameterizedTestSuiteName" according to Googletest FAQ [google-readability-avoid-underscore-in-googletest-name] // Underscores are allowed to disable a test with the DISABLED_ prefix. 
// https://google.github.io/googletest/faq.html#why-should-test-suite-names-and-test-names-not-contain-underscore
-TEST(TestCaseName, TestName) {}
-TEST(TestCaseName, DISABLED_TestName) {}
-TEST(DISABLED_TestCaseName, TestName) {}
-TEST(DISABLED_TestCaseName, DISABLED_TestName) {}
-
-TEST_F(TestCaseFixtureName, TestName) {}
-TEST_F(TestCaseFixtureName, DISABLED_TestName) {}
-TEST_F(DISABLED_TestCaseFixtureName, TestName) {}
-TEST_F(DISABLED_TestCaseFixtureName, DISABLED_TestName) {}
-
-TEST_P(ParameterizedTestCaseFixtureName, TestName) {}
-TEST_P(ParameterizedTestCaseFixtureName, DISABLED_TestName) {}
-TEST_P(DISABLED_ParameterizedTestCaseFixtureName, TestName) {}
-TEST_P(DISABLED_ParameterizedTestCaseFixtureName, DISABLED_TestName) {}
-
-TYPED_TEST(TypedTestName, TestName) {}
-TYPED_TEST(TypedTestName, DISABLED_TestName) {}
-TYPED_TEST(DISABLED_TypedTestName, TestName) {}
-TYPED_TEST(DISABLED_TypedTestName, DISABLED_TestName) {}
-
-TYPED_TEST_P(TypeParameterizedTestName, TestName) {}
-TYPED_TEST_P(TypeParameterizedTestName, DISABLED_TestName) {}
-TYPED_TEST_P(DISABLED_TypeParameterizedTestName, TestName) {}
-TYPED_TEST_P(DISABLED_TypeParameterizedTestName, DISABLED_TestName) {}
-
-FRIEND_TEST(FriendTest, Is_NotChecked) {}
-FRIEND_TEST(Friend_Test, IsNotChecked) {}
-FRIEND_TEST(Friend_Test, Is_NotChecked) {}
+TEST(TestSuiteName, TestName) {}
+TEST(TestSuiteName, DISABLED_TestName) {}
+TEST(DISABLED_TestSuiteName, TestName) {}
+TEST(DISABLED_TestSuiteName, DISABLED_TestName) {}
+
+TEST_F(TestSuiteFixtureName, TestName) {}
+TEST_F(TestSuiteFixtureName, DISABLED_TestName) {}
+TEST_F(DISABLED_TestSuiteFixtureName, TestName) {}
+TEST_F(DISABLED_TestSuiteFixtureName, DISABLED_TestName) {}
+
+TEST_P(ParameterizedTestSuiteFixtureName, TestName) {}
+TEST_P(ParameterizedTestSuiteFixtureName, DISABLED_TestName) {}
+TEST_P(DISABLED_ParameterizedTestSuiteFixtureName, TestName) {}
+TEST_P(DISABLED_ParameterizedTestSuiteFixtureName, DISABLED_TestName) {}
+
+TYPED_TEST(TypedTestSuiteName, TestName) {}
+TYPED_TEST(TypedTestSuiteName, DISABLED_TestName) {}
+TYPED_TEST(DISABLED_TypedTestSuiteName, TestName) {}
+TYPED_TEST(DISABLED_TypedTestSuiteName, DISABLED_TestName) {}
+
+TYPED_TEST_P(TypeParameterizedTestSuiteName, TestName) {}
+TYPED_TEST_P(TypeParameterizedTestSuiteName, DISABLED_TestName) {}
+TYPED_TEST_P(DISABLED_TypeParameterizedTestSuiteName, TestName) {}
+TYPED_TEST_P(DISABLED_TypeParameterizedTestSuiteName, DISABLED_TestName) {}
+
+FRIEND_TEST(FriendTestSuite, Is_NotChecked) {}
+FRIEND_TEST(Friend_TestSuite, IsNotChecked) {}
+FRIEND_TEST(Friend_TestSuite, Is_NotChecked) {}

From b0cd5b2a476063b588c59325720c841d79ed3262 Mon Sep 17 00:00:00 2001
From: Tobias Gysi
Date: Fri, 24 Mar 2023 07:57:24 +0100
Subject: [PATCH 207/208] [mlir][llvm] Switch remaining LLVM dialect tests to opaque pointers.

The revision switches the remaining LLVM dialect tests to use opaque
pointers. Selected tests are copied to a postfixed test file for the
time being. A number of tests disappear once we fully switch to opaque
pointers. In particular, all tests that verify a pointer element type
matches another type, as well as tests of recursive types.
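As an illustrative sketch of the difference (not part of this patch; %p and the
i32 element type are placeholder names), a load through a typed pointer carries
the element type in the pointer type,

    %0 = llvm.load %p : !llvm.ptr<i32>

whereas with opaque pointers the element type is spelled on the operation
itself, as in the updated tests below:

    %0 = llvm.load %p : !llvm.ptr -> i32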
Part of https://discourse.llvm.org/t/rfc-switching-the-llvm-dialect-and-dialect-lowerings-to-opaque-pointers/68179 Reviewed By: Dinistro, zero9178 Differential Revision: https://reviews.llvm.org/D146726 --- mlir/test/Dialect/LLVMIR/callgraph.mlir | 27 +- .../LLVMIR/canonicalize-typed-pointers.mlir | 86 ++++++ mlir/test/Dialect/LLVMIR/canonicalize.mlir | 55 ++-- mlir/test/Dialect/LLVMIR/debuginfo.mlir | 4 +- .../dynamic-gep-index-typed-pointers.mlir | 12 + .../Dialect/LLVMIR/dynamic-gep-index.mlir | 6 +- mlir/test/Dialect/LLVMIR/func.mlir | 62 ++-- .../Dialect/LLVMIR/global-typed-pointers.mlir | 46 +++ mlir/test/Dialect/LLVMIR/global.mlir | 47 +-- .../LLVMIR/invalid-typed-pointers.mlir | 283 ++++++++++++++++++ mlir/test/Dialect/LLVMIR/invalid.mlir | 187 +++++------- .../Dialect/LLVMIR/layout-typed-pointers.mlir | 145 +++++++++ mlir/test/Dialect/LLVMIR/layout.mlir | 78 +---- .../Dialect/LLVMIR/nvvm-typed-pointers.mlir | 55 ++++ mlir/test/Dialect/LLVMIR/nvvm.mlir | 30 +- ...arameter-attrs-invalid-typed-pointers.mlir | 6 + .../LLVMIR/parameter-attrs-invalid.mlir | 5 - .../LLVMIR/types-invalid-typed-pointers.mlir | 42 +++ mlir/test/Dialect/LLVMIR/types-invalid.mlir | 17 +- .../Dialect/LLVMIR/types-typed-pointers.mlir | 118 ++++++++ mlir/test/Dialect/LLVMIR/types.mlir | 62 +--- 21 files changed, 991 insertions(+), 382 deletions(-) create mode 100644 mlir/test/Dialect/LLVMIR/canonicalize-typed-pointers.mlir create mode 100644 mlir/test/Dialect/LLVMIR/dynamic-gep-index-typed-pointers.mlir create mode 100644 mlir/test/Dialect/LLVMIR/global-typed-pointers.mlir create mode 100644 mlir/test/Dialect/LLVMIR/invalid-typed-pointers.mlir create mode 100644 mlir/test/Dialect/LLVMIR/layout-typed-pointers.mlir create mode 100644 mlir/test/Dialect/LLVMIR/nvvm-typed-pointers.mlir create mode 100644 mlir/test/Dialect/LLVMIR/parameter-attrs-invalid-typed-pointers.mlir create mode 100644 mlir/test/Dialect/LLVMIR/types-invalid-typed-pointers.mlir create mode 100644 mlir/test/Dialect/LLVMIR/types-typed-pointers.mlir diff --git a/mlir/test/Dialect/LLVMIR/callgraph.mlir b/mlir/test/Dialect/LLVMIR/callgraph.mlir index edb5b35d126a5..ca1044b8288c4 100644 --- a/mlir/test/Dialect/LLVMIR/callgraph.mlir +++ b/mlir/test/Dialect/LLVMIR/callgraph.mlir @@ -58,33 +58,32 @@ module attributes {"test.name" = "Invoke call"} { // CHECK-DAG: -- Call-Edge : // CHECK: -- SCCs -- - llvm.mlir.global external constant @_ZTIi() : !llvm.ptr + llvm.mlir.global external constant @_ZTIi() : !llvm.ptr llvm.func @foo(%arg0: i32) -> !llvm.struct<(i32, f64, i32)> - llvm.func @bar(!llvm.ptr, !llvm.ptr, !llvm.ptr) + llvm.func @bar(!llvm.ptr, !llvm.ptr, !llvm.ptr) llvm.func @__gxx_personality_v0(...) 
-> i32 llvm.func @invokeLandingpad() -> i32 attributes { personality = @__gxx_personality_v0 } { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.constant(3 : i32) : i32 %2 = llvm.mlir.constant("\01") : !llvm.array<1 x i8> - %3 = llvm.mlir.null : !llvm.ptr> - %4 = llvm.mlir.null : !llvm.ptr - %5 = llvm.mlir.addressof @_ZTIi : !llvm.ptr> - %6 = llvm.bitcast %5 : !llvm.ptr> to !llvm.ptr - %7 = llvm.mlir.constant(1 : i32) : i32 - %8 = llvm.alloca %7 x i8 : (i32) -> !llvm.ptr - %9 = llvm.invoke @foo(%7) to ^bb2 unwind ^bb1 : (i32) -> !llvm.struct<(i32, f64, i32)> + %3 = llvm.mlir.null : !llvm.ptr + %4 = llvm.mlir.null : !llvm.ptr + %5 = llvm.mlir.addressof @_ZTIi : !llvm.ptr + %6 = llvm.mlir.constant(1 : i32) : i32 + %7 = llvm.alloca %6 x i8 : (i32) -> !llvm.ptr + %8 = llvm.invoke @foo(%6) to ^bb2 unwind ^bb1 : (i32) -> !llvm.struct<(i32, f64, i32)> ^bb1: - %10 = llvm.landingpad cleanup (catch %3 : !llvm.ptr>) (catch %6 : !llvm.ptr) (filter %2 : !llvm.array<1 x i8>) : !llvm.struct<(ptr, i32)> - %11 = llvm.intr.eh.typeid.for %6 : (!llvm.ptr) -> i32 - llvm.resume %10 : !llvm.struct<(ptr, i32)> + %10 = llvm.landingpad cleanup (catch %3 : !llvm.ptr) (catch %5 : !llvm.ptr) (filter %2 : !llvm.array<1 x i8>) : !llvm.struct<(ptr, i32)> + %11 = llvm.intr.eh.typeid.for %5 : (!llvm.ptr) -> i32 + llvm.resume %10 : !llvm.struct<(ptr, i32)> ^bb2: - llvm.return %7 : i32 + llvm.return %6 : i32 ^bb3: - llvm.invoke @bar(%8, %6, %4) to ^bb2 unwind ^bb1 : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> () + llvm.invoke @bar(%7, %5, %4) to ^bb2 unwind ^bb1 : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> () ^bb4: llvm.return %0 : i32 diff --git a/mlir/test/Dialect/LLVMIR/canonicalize-typed-pointers.mlir b/mlir/test/Dialect/LLVMIR/canonicalize-typed-pointers.mlir new file mode 100644 index 0000000000000..2ae9727482fc3 --- /dev/null +++ b/mlir/test/Dialect/LLVMIR/canonicalize-typed-pointers.mlir @@ -0,0 +1,86 @@ +// RUN: mlir-opt --pass-pipeline='builtin.module(llvm.func(canonicalize{test-convergence}))' %s -split-input-file | FileCheck %s + +// CHECK-LABEL: fold_bitcast +// CHECK-SAME: %[[a0:arg[0-9]+]] +// CHECK-NEXT: llvm.return %[[a0]] +llvm.func @fold_bitcast(%x : !llvm.ptr) -> !llvm.ptr { + %c = llvm.bitcast %x : !llvm.ptr to !llvm.ptr + llvm.return %c : !llvm.ptr +} + +// CHECK-LABEL: fold_bitcast2 +// CHECK-SAME: %[[a0:arg[0-9]+]] +// CHECK-NEXT: llvm.return %[[a0]] +llvm.func @fold_bitcast2(%x : !llvm.ptr) -> !llvm.ptr { + %c = llvm.bitcast %x : !llvm.ptr to !llvm.ptr + %d = llvm.bitcast %c : !llvm.ptr to !llvm.ptr + llvm.return %d : !llvm.ptr +} + +// ----- + +// CHECK-LABEL: fold_addrcast +// CHECK-SAME: %[[a0:arg[0-9]+]] +// CHECK-NEXT: llvm.return %[[a0]] +llvm.func @fold_addrcast(%x : !llvm.ptr) -> !llvm.ptr { + %c = llvm.addrspacecast %x : !llvm.ptr to !llvm.ptr + llvm.return %c : !llvm.ptr +} + +// CHECK-LABEL: fold_addrcast2 +// CHECK-SAME: %[[a0:arg[0-9]+]] +// CHECK-NEXT: llvm.return %[[a0]] +llvm.func @fold_addrcast2(%x : !llvm.ptr) -> !llvm.ptr { + %c = llvm.addrspacecast %x : !llvm.ptr to !llvm.ptr + %d = llvm.addrspacecast %c : !llvm.ptr to !llvm.ptr + llvm.return %d : !llvm.ptr +} + +// ----- + +// CHECK-LABEL: fold_gep +// CHECK-SAME: %[[a0:arg[0-9]+]] +// CHECK-NEXT: llvm.return %[[a0]] +llvm.func @fold_gep(%x : !llvm.ptr) -> !llvm.ptr { + %c0 = arith.constant 0 : i32 + %c = llvm.getelementptr %x[%c0] : (!llvm.ptr, i32) -> !llvm.ptr + llvm.return %c : !llvm.ptr +} + +// ----- + +// CHECK-LABEL: fold_gep_canon +// CHECK-SAME: %[[a0:arg[0-9]+]] +// CHECK-NEXT: %[[RES:.*]] = llvm.getelementptr %[[a0]][2] +// 
CHECK-NEXT: llvm.return %[[RES]] +llvm.func @fold_gep_canon(%x : !llvm.ptr) -> !llvm.ptr { + %c2 = arith.constant 2 : i32 + %c = llvm.getelementptr %x[%c2] : (!llvm.ptr, i32) -> !llvm.ptr + llvm.return %c : !llvm.ptr +} + +// ----- + +// CHECK-LABEL: load_dce +// CHECK-NEXT: llvm.return +llvm.func @load_dce(%x : !llvm.ptr) { + %0 = llvm.load %x : !llvm.ptr + llvm.return +} + +llvm.mlir.global external @fp() : !llvm.ptr + +// CHECK-LABEL: addr_dce +// CHECK-NEXT: llvm.return +llvm.func @addr_dce(%x : !llvm.ptr) { + %0 = llvm.mlir.addressof @fp : !llvm.ptr> + llvm.return +} + +// CHECK-LABEL: alloca_dce +// CHECK-NEXT: llvm.return +llvm.func @alloca_dce() { + %c1_i64 = arith.constant 1 : i64 + %0 = llvm.alloca %c1_i64 x i32 : (i64) -> !llvm.ptr + llvm.return +} diff --git a/mlir/test/Dialect/LLVMIR/canonicalize.mlir b/mlir/test/Dialect/LLVMIR/canonicalize.mlir index 7fa7684f5ace0..6b2cac14f2985 100644 --- a/mlir/test/Dialect/LLVMIR/canonicalize.mlir +++ b/mlir/test/Dialect/LLVMIR/canonicalize.mlir @@ -37,8 +37,8 @@ llvm.func @no_fold_extractvalue(%arr: !llvm.array<4 x f32>) -> f32 { %3 = llvm.extractvalue %2[0, 0] : !llvm.array<4 x !llvm.array<4 x f32>> llvm.return %3 : f32 - } + // ----- // CHECK-LABEL: fold_unrelated_extractvalue @@ -56,18 +56,18 @@ llvm.func @fold_unrelated_extractvalue(%arr: !llvm.array<4 x f32>) -> f32 { // CHECK-LABEL: fold_bitcast // CHECK-SAME: %[[a0:arg[0-9]+]] // CHECK-NEXT: llvm.return %[[a0]] -llvm.func @fold_bitcast(%x : !llvm.ptr) -> !llvm.ptr { - %c = llvm.bitcast %x : !llvm.ptr to !llvm.ptr - llvm.return %c : !llvm.ptr +llvm.func @fold_bitcast(%x : !llvm.ptr) -> !llvm.ptr { + %c = llvm.bitcast %x : !llvm.ptr to !llvm.ptr + llvm.return %c : !llvm.ptr } // CHECK-LABEL: fold_bitcast2 // CHECK-SAME: %[[a0:arg[0-9]+]] // CHECK-NEXT: llvm.return %[[a0]] -llvm.func @fold_bitcast2(%x : !llvm.ptr) -> !llvm.ptr { - %c = llvm.bitcast %x : !llvm.ptr to !llvm.ptr - %d = llvm.bitcast %c : !llvm.ptr to !llvm.ptr - llvm.return %d : !llvm.ptr +llvm.func @fold_bitcast2(%x : i32) -> i32 { + %c = llvm.bitcast %x : i32 to f32 + %d = llvm.bitcast %c : f32 to i32 + llvm.return %d : i32 } // ----- @@ -75,18 +75,18 @@ llvm.func @fold_bitcast2(%x : !llvm.ptr) -> !llvm.ptr { // CHECK-LABEL: fold_addrcast // CHECK-SAME: %[[a0:arg[0-9]+]] // CHECK-NEXT: llvm.return %[[a0]] -llvm.func @fold_addrcast(%x : !llvm.ptr) -> !llvm.ptr { - %c = llvm.addrspacecast %x : !llvm.ptr to !llvm.ptr - llvm.return %c : !llvm.ptr +llvm.func @fold_addrcast(%x : !llvm.ptr) -> !llvm.ptr { + %c = llvm.addrspacecast %x : !llvm.ptr to !llvm.ptr + llvm.return %c : !llvm.ptr } // CHECK-LABEL: fold_addrcast2 // CHECK-SAME: %[[a0:arg[0-9]+]] // CHECK-NEXT: llvm.return %[[a0]] -llvm.func @fold_addrcast2(%x : !llvm.ptr) -> !llvm.ptr { - %c = llvm.addrspacecast %x : !llvm.ptr to !llvm.ptr - %d = llvm.addrspacecast %c : !llvm.ptr to !llvm.ptr - llvm.return %d : !llvm.ptr +llvm.func @fold_addrcast2(%x : !llvm.ptr) -> !llvm.ptr { + %c = llvm.addrspacecast %x : !llvm.ptr to !llvm.ptr<5> + %d = llvm.addrspacecast %c : !llvm.ptr<5> to !llvm.ptr + llvm.return %d : !llvm.ptr } // ----- @@ -94,10 +94,10 @@ llvm.func @fold_addrcast2(%x : !llvm.ptr) -> !llvm.ptr { // CHECK-LABEL: fold_gep // CHECK-SAME: %[[a0:arg[0-9]+]] // CHECK-NEXT: llvm.return %[[a0]] -llvm.func @fold_gep(%x : !llvm.ptr) -> !llvm.ptr { +llvm.func @fold_gep(%x : !llvm.ptr) -> !llvm.ptr { %c0 = arith.constant 0 : i32 - %c = llvm.getelementptr %x[%c0] : (!llvm.ptr, i32) -> !llvm.ptr - llvm.return %c : !llvm.ptr + %c = llvm.getelementptr %x[%c0] : (!llvm.ptr, i32) 
-> !llvm.ptr, i8 + llvm.return %c : !llvm.ptr } // CHECK-LABEL: fold_gep_neg @@ -114,13 +114,12 @@ llvm.func @fold_gep_neg(%x : !llvm.ptr) -> !llvm.ptr { // CHECK-SAME: %[[a0:arg[0-9]+]] // CHECK-NEXT: %[[RES:.*]] = llvm.getelementptr %[[a0]][2] // CHECK-NEXT: llvm.return %[[RES]] -llvm.func @fold_gep_canon(%x : !llvm.ptr) -> !llvm.ptr { +llvm.func @fold_gep_canon(%x : !llvm.ptr) -> !llvm.ptr { %c2 = arith.constant 2 : i32 - %c = llvm.getelementptr %x[%c2] : (!llvm.ptr, i32) -> !llvm.ptr - llvm.return %c : !llvm.ptr + %c = llvm.getelementptr %x[%c2] : (!llvm.ptr, i32) -> !llvm.ptr, i8 + llvm.return %c : !llvm.ptr } - // ----- // Check that LLVM constants participate in cross-dialect constant folding. The @@ -142,17 +141,17 @@ llvm.func @llvm_constant() -> i32 { // CHECK-LABEL: load_dce // CHECK-NEXT: llvm.return -llvm.func @load_dce(%x : !llvm.ptr) { - %0 = llvm.load %x : !llvm.ptr +llvm.func @load_dce(%x : !llvm.ptr) { + %0 = llvm.load %x : !llvm.ptr -> i8 llvm.return } -llvm.mlir.global external @fp() : !llvm.ptr +llvm.mlir.global external @fp() : !llvm.ptr // CHECK-LABEL: addr_dce // CHECK-NEXT: llvm.return -llvm.func @addr_dce(%x : !llvm.ptr) { - %0 = llvm.mlir.addressof @fp : !llvm.ptr> +llvm.func @addr_dce(%x : !llvm.ptr) { + %0 = llvm.mlir.addressof @fp : !llvm.ptr llvm.return } @@ -160,6 +159,6 @@ llvm.func @addr_dce(%x : !llvm.ptr) { // CHECK-NEXT: llvm.return llvm.func @alloca_dce() { %c1_i64 = arith.constant 1 : i64 - %0 = llvm.alloca %c1_i64 x i32 : (i64) -> !llvm.ptr + %0 = llvm.alloca %c1_i64 x i32 : (i64) -> !llvm.ptr llvm.return } diff --git a/mlir/test/Dialect/LLVMIR/debuginfo.mlir b/mlir/test/Dialect/LLVMIR/debuginfo.mlir index 7aaef0d31bb9d..f7517b2f23108 100644 --- a/mlir/test/Dialect/LLVMIR/debuginfo.mlir +++ b/mlir/test/Dialect/LLVMIR/debuginfo.mlir @@ -134,10 +134,10 @@ llvm.func @addr(%arg: i64) { // CHECK: %[[ALLOC:.*]] = llvm.alloca %allocCount = llvm.mlir.constant(1 : i32) : i32 - %alloc = llvm.alloca %allocCount x i64 : (i32) -> !llvm.ptr + %alloc = llvm.alloca %allocCount x i64 : (i32) -> !llvm.ptr // CHECK: llvm.intr.dbg.declare #[[VAR0]] = %[[ALLOC]] - llvm.intr.dbg.declare #var0 = %alloc : !llvm.ptr + llvm.intr.dbg.declare #var0 = %alloc : !llvm.ptr llvm.return } diff --git a/mlir/test/Dialect/LLVMIR/dynamic-gep-index-typed-pointers.mlir b/mlir/test/Dialect/LLVMIR/dynamic-gep-index-typed-pointers.mlir new file mode 100644 index 0000000000000..9e14b1db3432b --- /dev/null +++ b/mlir/test/Dialect/LLVMIR/dynamic-gep-index-typed-pointers.mlir @@ -0,0 +1,12 @@ +// RUN: mlir-opt %s | FileCheck %s + +module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>>} { + // CHECK: llvm.func @foo(%[[ARG0:.+]]: !llvm.ptr>, %[[ARG1:.+]]: i32) + llvm.func @foo(%arg0: !llvm.ptr, array<4 x i32>)>>, %arg1: i32) { + // CHECK: %[[C0:.+]] = llvm.mlir.constant(0 : i32) + %0 = llvm.mlir.constant(0 : i32) : i32 + // CHECK: llvm.getelementptr %[[ARG0]][%[[C0]], 1, %[[ARG1]]] + %1 = "llvm.getelementptr"(%arg0, %0, %arg1) {rawConstantIndices = array} : (!llvm.ptr, array<4 x i32>)>>, i32, i32) -> !llvm.ptr + llvm.return + } +} diff --git a/mlir/test/Dialect/LLVMIR/dynamic-gep-index.mlir b/mlir/test/Dialect/LLVMIR/dynamic-gep-index.mlir index 9e14b1db3432b..f5808134ea026 100644 --- a/mlir/test/Dialect/LLVMIR/dynamic-gep-index.mlir +++ 
b/mlir/test/Dialect/LLVMIR/dynamic-gep-index.mlir @@ -1,12 +1,12 @@ // RUN: mlir-opt %s | FileCheck %s module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>>} { - // CHECK: llvm.func @foo(%[[ARG0:.+]]: !llvm.ptr>, %[[ARG1:.+]]: i32) - llvm.func @foo(%arg0: !llvm.ptr, array<4 x i32>)>>, %arg1: i32) { + // CHECK: llvm.func @foo(%[[ARG0:.+]]: !llvm.ptr, %[[ARG1:.+]]: i32) + llvm.func @foo(%arg0: !llvm.ptr, %arg1: i32) { // CHECK: %[[C0:.+]] = llvm.mlir.constant(0 : i32) %0 = llvm.mlir.constant(0 : i32) : i32 // CHECK: llvm.getelementptr %[[ARG0]][%[[C0]], 1, %[[ARG1]]] - %1 = "llvm.getelementptr"(%arg0, %0, %arg1) {rawConstantIndices = array} : (!llvm.ptr, array<4 x i32>)>>, i32, i32) -> !llvm.ptr + %1 = "llvm.getelementptr"(%arg0, %0, %arg1) {elem_type = !llvm.struct<"my_struct", (struct<"sub_struct", (i32, i8)>, array<4 x i32>)>, rawConstantIndices = array} : (!llvm.ptr, i32, i32) -> !llvm.ptr llvm.return } } diff --git a/mlir/test/Dialect/LLVMIR/func.mlir b/mlir/test/Dialect/LLVMIR/func.mlir index 5cc7d75b627fa..50f6c6a0e56f5 100644 --- a/mlir/test/Dialect/LLVMIR/func.mlir +++ b/mlir/test/Dialect/LLVMIR/func.mlir @@ -33,10 +33,10 @@ module { // GENERIC-SAME: () -> () }) {sym_name = "baz", function_type = !llvm.func} : () -> () - // CHECK: llvm.func @qux(!llvm.ptr {llvm.noalias}, i64) + // CHECK: llvm.func @qux(!llvm.ptr {llvm.noalias}, i64) // CHECK: attributes {xxx = {yyy = 42 : i64}} "llvm.func"() ({ - }) {sym_name = "qux", function_type = !llvm.func, i64)>, + }) {sym_name = "qux", function_type = !llvm.func, arg_attrs = [{llvm.noalias}, {}], xxx = {yyy = 42}} : () -> () // CHECK: llvm.func @roundtrip1() @@ -71,56 +71,56 @@ module { // CHECK: llvm.func @roundtrip8() -> i32 llvm.func @roundtrip8() -> i32 attributes {} - // CHECK: llvm.func @roundtrip9(!llvm.ptr {llvm.noalias}) - llvm.func @roundtrip9(!llvm.ptr {llvm.noalias}) + // CHECK: llvm.func @roundtrip9(!llvm.ptr {llvm.noalias}) + llvm.func @roundtrip9(!llvm.ptr {llvm.noalias}) - // CHECK: llvm.func @roundtrip10(!llvm.ptr {llvm.noalias}) - llvm.func @roundtrip10(%arg0: !llvm.ptr {llvm.noalias}) + // CHECK: llvm.func @roundtrip10(!llvm.ptr {llvm.noalias}) + llvm.func @roundtrip10(%arg0: !llvm.ptr {llvm.noalias}) - // CHECK: llvm.func @roundtrip11(%{{.*}}: !llvm.ptr {llvm.noalias}) { - llvm.func @roundtrip11(%arg0: !llvm.ptr {llvm.noalias}) { + // CHECK: llvm.func @roundtrip11(%{{.*}}: !llvm.ptr {llvm.noalias}) { + llvm.func @roundtrip11(%arg0: !llvm.ptr {llvm.noalias}) { llvm.return } - // CHECK: llvm.func @roundtrip12(%{{.*}}: !llvm.ptr {llvm.noalias}) + // CHECK: llvm.func @roundtrip12(%{{.*}}: !llvm.ptr {llvm.noalias}) // CHECK: attributes {foo = 42 : i32} - llvm.func @roundtrip12(%arg0: !llvm.ptr {llvm.noalias}) + llvm.func @roundtrip12(%arg0: !llvm.ptr {llvm.noalias}) attributes {foo = 42 : i32} { llvm.return } - // CHECK: llvm.func @byvalattr(%{{.*}}: !llvm.ptr {llvm.byval = i32}) - llvm.func @byvalattr(%arg0: !llvm.ptr {llvm.byval = i32}) { + // CHECK: llvm.func @byvalattr(%{{.*}}: !llvm.ptr {llvm.byval = i32}) + llvm.func @byvalattr(%arg0: !llvm.ptr {llvm.byval = i32}) { llvm.return } - // CHECK: llvm.func @sretattr(%{{.*}}: !llvm.ptr {llvm.sret = i32}) - // LOCINFO: llvm.func @sretattr(%{{.*}}: !llvm.ptr {llvm.sret = i32} loc("some_source_loc")) - llvm.func 
@sretattr(%arg0: !llvm.ptr {llvm.sret = i32} loc("some_source_loc")) { + // CHECK: llvm.func @sretattr(%{{.*}}: !llvm.ptr {llvm.sret = i32}) + // LOCINFO: llvm.func @sretattr(%{{.*}}: !llvm.ptr {llvm.sret = i32} loc("some_source_loc")) + llvm.func @sretattr(%arg0: !llvm.ptr {llvm.sret = i32} loc("some_source_loc")) { llvm.return } - // CHECK: llvm.func @nestattr(%{{.*}}: !llvm.ptr {llvm.nest}) - llvm.func @nestattr(%arg0: !llvm.ptr {llvm.nest}) { + // CHECK: llvm.func @nestattr(%{{.*}}: !llvm.ptr {llvm.nest}) + llvm.func @nestattr(%arg0: !llvm.ptr {llvm.nest}) { llvm.return } - // CHECK: llvm.func @llvm_noalias_decl(!llvm.ptr {llvm.noalias}) - llvm.func @llvm_noalias_decl(!llvm.ptr {llvm.noalias}) - // CHECK: llvm.func @byrefattr_decl(!llvm.ptr {llvm.byref = i32}) - llvm.func @byrefattr_decl(!llvm.ptr {llvm.byref = i32}) - // CHECK: llvm.func @byvalattr_decl(!llvm.ptr {llvm.byval = i32}) - llvm.func @byvalattr_decl(!llvm.ptr {llvm.byval = i32}) - // CHECK: llvm.func @sretattr_decl(!llvm.ptr {llvm.sret = i32}) - llvm.func @sretattr_decl(!llvm.ptr {llvm.sret = i32}) - // CHECK: llvm.func @nestattr_decl(!llvm.ptr {llvm.nest}) - llvm.func @nestattr_decl(!llvm.ptr {llvm.nest}) + // CHECK: llvm.func @llvm_noalias_decl(!llvm.ptr {llvm.noalias}) + llvm.func @llvm_noalias_decl(!llvm.ptr {llvm.noalias}) + // CHECK: llvm.func @byrefattr_decl(!llvm.ptr {llvm.byref = i32}) + llvm.func @byrefattr_decl(!llvm.ptr {llvm.byref = i32}) + // CHECK: llvm.func @byvalattr_decl(!llvm.ptr {llvm.byval = i32}) + llvm.func @byvalattr_decl(!llvm.ptr {llvm.byval = i32}) + // CHECK: llvm.func @sretattr_decl(!llvm.ptr {llvm.sret = i32}) + llvm.func @sretattr_decl(!llvm.ptr {llvm.sret = i32}) + // CHECK: llvm.func @nestattr_decl(!llvm.ptr {llvm.nest}) + llvm.func @nestattr_decl(!llvm.ptr {llvm.nest}) // CHECK: llvm.func @noundefattr_decl(i32 {llvm.noundef}) llvm.func @noundefattr_decl(i32 {llvm.noundef}) - // CHECK: llvm.func @llvm_align_decl(!llvm.ptr {llvm.align = 4 : i64}) - llvm.func @llvm_align_decl(!llvm.ptr {llvm.align = 4}) - // CHECK: llvm.func @inallocaattr_decl(!llvm.ptr {llvm.inalloca = i32}) - llvm.func @inallocaattr_decl(!llvm.ptr {llvm.inalloca = i32}) + // CHECK: llvm.func @llvm_align_decl(!llvm.ptr {llvm.align = 4 : i64}) + llvm.func @llvm_align_decl(!llvm.ptr {llvm.align = 4}) + // CHECK: llvm.func @inallocaattr_decl(!llvm.ptr {llvm.inalloca = i32}) + llvm.func @inallocaattr_decl(!llvm.ptr {llvm.inalloca = i32}) // CHECK: llvm.func @variadic(...) 
diff --git a/mlir/test/Dialect/LLVMIR/global-typed-pointers.mlir b/mlir/test/Dialect/LLVMIR/global-typed-pointers.mlir new file mode 100644 index 0000000000000..56d720cc866b6 --- /dev/null +++ b/mlir/test/Dialect/LLVMIR/global-typed-pointers.mlir @@ -0,0 +1,46 @@ +// RUN: mlir-opt -split-input-file -verify-diagnostics %s | FileCheck %s + +// CHECK: llvm.mlir.global internal @global(42 : i64) {addr_space = 0 : i32} : i64 +llvm.mlir.global internal @global(42 : i64) : i64 + +// CHECK: llvm.mlir.global internal constant @".string"("foobar") +llvm.mlir.global internal constant @".string"("foobar") : !llvm.array<6 x i8> + +func.func @references() { + // CHECK: llvm.mlir.addressof @global : !llvm.ptr + %0 = llvm.mlir.addressof @global : !llvm.ptr + + // CHECK: llvm.mlir.addressof @".string" : !llvm.ptr> + %1 = llvm.mlir.addressof @".string" : !llvm.ptr> + + llvm.return +} + +// ----- + +llvm.mlir.global internal @foo(0: i32) : i32 + +func.func @bar() { + // expected-error @+1 {{the type must be a pointer to the type of the referenced global}} + llvm.mlir.addressof @foo : !llvm.ptr + llvm.return +} + +// ----- + +llvm.func @foo() + +llvm.func @bar() { + // expected-error @+1 {{the type must be a pointer to the type of the referenced function}} + llvm.mlir.addressof @foo : !llvm.ptr + llvm.return +} + +// ----- + +llvm.mlir.global internal @g(32 : i64) {addr_space = 3: i32} : i64 +func.func @mismatch_addr_space() { + // expected-error @+1 {{pointer address space must match address space of the referenced global}} + llvm.mlir.addressof @g : !llvm.ptr + llvm.return +} diff --git a/mlir/test/Dialect/LLVMIR/global.mlir b/mlir/test/Dialect/LLVMIR/global.mlir index 2f0850834a0ef..aff116db5dcca 100644 --- a/mlir/test/Dialect/LLVMIR/global.mlir +++ b/mlir/test/Dialect/LLVMIR/global.mlir @@ -66,17 +66,14 @@ llvm.mlir.global external @has_addr_space(32 : i64) {addr_space = 3: i32} : i64 // CHECK-LABEL: references func.func @references() { - // CHECK: llvm.mlir.addressof @global : !llvm.ptr - %0 = llvm.mlir.addressof @global : !llvm.ptr - - // CHECK: llvm.mlir.addressof @".string" : !llvm.ptr> - %1 = llvm.mlir.addressof @".string" : !llvm.ptr> + // CHECK: llvm.mlir.addressof @".string" : !llvm.ptr + %0 = llvm.mlir.addressof @".string" : !llvm.ptr // CHECK: llvm.mlir.addressof @global : !llvm.ptr - %2 = llvm.mlir.addressof @global : !llvm.ptr + %1 = llvm.mlir.addressof @global : !llvm.ptr // CHECK: llvm.mlir.addressof @has_addr_space : !llvm.ptr<3> - %3 = llvm.mlir.addressof @has_addr_space : !llvm.ptr<3> + %2 = llvm.mlir.addressof @has_addr_space : !llvm.ptr<3> llvm.return } @@ -164,7 +161,7 @@ func.func @foo() { // The attribute parser will consume the first colon-type, so we put two of // them to trigger the attribute type mismatch error. 
// expected-error @+1 {{invalid kind of attribute specified}} - llvm.mlir.addressof "foo" : i64 : !llvm.ptr> + llvm.mlir.addressof "foo" : i64 : !llvm.ptr llvm.return } @@ -172,27 +169,7 @@ func.func @foo() { func.func @foo() { // expected-error @+1 {{must reference a global defined by 'llvm.mlir.global'}} - llvm.mlir.addressof @foo : !llvm.ptr> - llvm.return -} - -// ----- - -llvm.mlir.global internal @foo(0: i32) : i32 - -func.func @bar() { - // expected-error @+1 {{the type must be a pointer to the type of the referenced global}} - llvm.mlir.addressof @foo : !llvm.ptr - llvm.return -} - -// ----- - -llvm.func @foo() - -llvm.func @bar() { - // expected-error @+1 {{the type must be a pointer to the type of the referenced function}} - llvm.mlir.addressof @foo : !llvm.ptr + llvm.mlir.addressof @foo : !llvm.ptr llvm.return } @@ -224,23 +201,15 @@ llvm.mlir.global internal @g(43 : i64) : i64 { llvm.mlir.global internal @g(32 : i64) {addr_space = 3: i32} : i64 func.func @mismatch_addr_space_implicit_global() { // expected-error @+1 {{pointer address space must match address space of the referenced global}} - llvm.mlir.addressof @g : !llvm.ptr + llvm.mlir.addressof @g : !llvm.ptr llvm.return } // ----- llvm.mlir.global internal @g(32 : i64) {addr_space = 3: i32} : i64 -func.func @mismatch_addr_space() { - // expected-error @+1 {{pointer address space must match address space of the referenced global}} - llvm.mlir.addressof @g : !llvm.ptr - llvm.return -} -// ----- -llvm.mlir.global internal @g(32 : i64) {addr_space = 3: i32} : i64 - -func.func @mismatch_addr_space_opaque() { +func.func @mismatch_addr_space() { // expected-error @+1 {{pointer address space must match address space of the referenced global}} llvm.mlir.addressof @g : !llvm.ptr<4> llvm.return diff --git a/mlir/test/Dialect/LLVMIR/invalid-typed-pointers.mlir b/mlir/test/Dialect/LLVMIR/invalid-typed-pointers.mlir new file mode 100644 index 0000000000000..033b84d04ef87 --- /dev/null +++ b/mlir/test/Dialect/LLVMIR/invalid-typed-pointers.mlir @@ -0,0 +1,283 @@ +// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -verify-diagnostics + +func.func @alloca_ptr_type_attr_non_opaque_ptr(%sz : i64) { + // expected-error@below {{unexpected 'elem_type' attribute when non-opaque pointer type is used}} + "llvm.alloca"(%sz) { elem_type = i32 } : (i64) -> !llvm.ptr +} + +// ----- + +func.func @gep_missing_input_type(%pos : i64, %base : !llvm.ptr) { + // expected-error@+1 {{2 operands present, but expected 0}} + llvm.getelementptr %base[%pos] : () -> (!llvm.ptr) +} + +// ----- + +func.func @gep_missing_result_type(%pos : i64, %base : !llvm.ptr) { + // expected-error@+1 {{op requires one result}} + llvm.getelementptr %base[%pos] : (!llvm.ptr, i64) -> () +} + +// ----- + +func.func @gep_non_function_type(%pos : i64, %base : !llvm.ptr) { + // expected-error@+1 {{invalid kind of type specified}} + llvm.getelementptr %base[%pos] : !llvm.ptr +} + +// ----- + +func.func @gep_too_few_dynamic(%base : !llvm.ptr) { + // expected-error@+1 {{expected as many dynamic indices as specified in 'rawConstantIndices'}} + %1 = "llvm.getelementptr"(%base) {rawConstantIndices = array} : (!llvm.ptr) -> !llvm.ptr +} + +// ----- + +func.func @call_variadic(%callee : !llvm.ptr>, %arg : i8) { + // expected-error@+1 {{indirect calls to variadic functions are not supported}} + llvm.call %callee(%arg) : !llvm.ptr>, (i8) -> (i8) + llvm.return +} + +// ----- + +func.func @indirect_callee_arg_mismatch(%arg0 : i32, %callee : !llvm.ptr>) { + // expected-error@+1 
{{'llvm.call' op operand type mismatch for operand 0: 'i32' != 'i8'}} + "llvm.call"(%callee, %arg0) : (!llvm.ptr>, i32) -> () + llvm.return +} + +// ----- + +func.func @indirect_callee_return_mismatch(%callee : !llvm.ptr>) { + // expected-error@+1 {{'llvm.call' op result type mismatch: 'i32' != 'i8'}} + "llvm.call"(%callee) : (!llvm.ptr>) -> (i32) + llvm.return +} + +// ----- + +func.func @atomicrmw_mismatched_operands(%f32_ptr : !llvm.ptr, %i32 : i32) { + // expected-error@+1 {{expected LLVM IR element type for operand #0 to match type for operand #1}} + %0 = "llvm.atomicrmw"(%f32_ptr, %i32) {bin_op=11, ordering=1} : (!llvm.ptr, i32) -> i32 + llvm.return +} + +// ----- + +func.func @cmpxchg_expected_ptr(%f32 : f32) { + // expected-error@+1 {{op operand #0 must be LLVM pointer to integer or LLVM pointer type}} + %0 = "llvm.cmpxchg"(%f32, %f32, %f32) {success_ordering=2,failure_ordering=2} : (f32, f32, f32) -> !llvm.struct<(f32, i1)> + llvm.return +} + +// ----- + +func.func @cmpxchg_mismatched_operands(%i64_ptr : !llvm.ptr, %i32 : i32) { + // expected-error@+1 {{expected LLVM IR element type for operand #0 to match type for all other operands}} + %0 = "llvm.cmpxchg"(%i64_ptr, %i32, %i32) {success_ordering=2,failure_ordering=2} : (!llvm.ptr, i32, i32) -> !llvm.struct<(i32, i1)> + llvm.return +} + +// ----- + +llvm.func @foo(i32) -> i32 +llvm.func @__gxx_personality_v0(...) -> i32 + +llvm.func @bad_landingpad(%arg0: !llvm.ptr>) -> i32 attributes { personality = @__gxx_personality_v0} { + %0 = llvm.mlir.constant(3 : i32) : i32 + %1 = llvm.mlir.constant(2 : i32) : i32 + %2 = llvm.invoke @foo(%1) to ^bb1 unwind ^bb2 : (i32) -> i32 +^bb1: // pred: ^bb0 + llvm.return %1 : i32 +^bb2: // pred: ^bb0 + // expected-error@+1 {{clause #0 is not a known constant - null, addressof, bitcast}} + %3 = llvm.landingpad cleanup (catch %1 : i32) (catch %arg0 : !llvm.ptr>) : !llvm.struct<(ptr, i32)> + llvm.return %0 : i32 +} + +// ----- + +llvm.func @foo(i32) -> i32 +llvm.func @__gxx_personality_v0(...) -> i32 + +llvm.func @caller(%arg0: i32) -> i32 attributes { personality = @__gxx_personality_v0} { + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x !llvm.ptr : (i32) -> !llvm.ptr> + // expected-note@+1 {{global addresses expected as operand to bitcast used in clauses for landingpad}} + %2 = llvm.bitcast %1 : !llvm.ptr> to !llvm.ptr + %3 = llvm.invoke @foo(%0) to ^bb1 unwind ^bb2 : (i32) -> i32 +^bb1: // pred: ^bb0 + llvm.return %0 : i32 +^bb2: // pred: ^bb0 + // expected-error@+1 {{constant clauses expected}} + %5 = llvm.landingpad (catch %2 : !llvm.ptr) : !llvm.struct<(ptr, i32)> + llvm.return %0 : i32 +} + +// ----- + +llvm.func @foo(i32) -> i32 +llvm.func @__gxx_personality_v0(...) -> i32 + +llvm.func @caller(%arg0: i32) -> i32 attributes { personality = @__gxx_personality_v0} { + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.invoke @foo(%0) to ^bb1 unwind ^bb2 : (i32) -> i32 +^bb1: // pred: ^bb0 + llvm.return %0 : i32 +^bb2: // pred: ^bb0 + // expected-error@+1 {{landingpad instruction expects at least one clause or cleanup attribute}} + %2 = llvm.landingpad : !llvm.struct<(ptr, i32)> + llvm.return %0 : i32 +} + +// ----- + +llvm.func @foo(i32) -> i32 +llvm.func @__gxx_personality_v0(...) 
-> i32 + +llvm.func @caller(%arg0: i32) -> i32 attributes { personality = @__gxx_personality_v0 } { + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.invoke @foo(%0) to ^bb1 unwind ^bb2 : (i32) -> i32 +^bb1: // pred: ^bb0 + llvm.return %0 : i32 +^bb2: // pred: ^bb0 + %2 = llvm.landingpad cleanup : !llvm.struct<(ptr, i32)> + // expected-error@+1 {{'llvm.resume' op expects landingpad value as operand}} + llvm.resume %0 : i32 +} + +// ----- + +llvm.func @foo(i32) -> i32 + +llvm.func @caller(%arg0: i32) -> i32 { + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.invoke @foo(%0) to ^bb1 unwind ^bb2 : (i32) -> i32 +^bb1: // pred: ^bb0 + llvm.return %0 : i32 +^bb2: // pred: ^bb0 + // expected-error@+1 {{llvm.landingpad needs to be in a function with a personality}} + %2 = llvm.landingpad cleanup : !llvm.struct<(ptr, i32)> + llvm.resume %2 : !llvm.struct<(ptr, i32)> +} + +// ----- + +llvm.func @wmmaLoadOp_invalid_mem_space(%arg0: !llvm.ptr<5>, %arg1: i32) { + // expected-error@+1 {{'nvvm.wmma.load' op expected source pointer in memory space 0, 1, 3}} + %0 = nvvm.wmma.load %arg0, %arg1 + {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} + : (!llvm.ptr<5>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> + llvm.return +} + +// ----- + +llvm.func @wmmaLoadOp_invalid_AOp(%arg0: !llvm.ptr<3>, %arg1: i32) { + // expected-error@+1 {{'nvvm.wmma.load' op expected destination type is a structure of 8 elements of type 'vector<2xf16>'}} + %0 = nvvm.wmma.load %arg0, %arg1 + {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} + : (!llvm.ptr<3>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> + llvm.return +} + +// ----- + +llvm.func @wmmaLoadOp_invalid_BOp(%arg0: !llvm.ptr<3>, %arg1: i32) { + // expected-error@+1 {{'nvvm.wmma.load' op expected destination type is a structure of 8 elements of type 'vector<2xf16>'}} + %0 = nvvm.wmma.load %arg0, %arg1 + {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} + : (!llvm.ptr<3>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> + + llvm.return +} + +// ----- + +llvm.func @wmmaLoadOp_invalid_COp(%arg0: !llvm.ptr<3>, %arg1: i32) { + // expected-error@+1 {{'nvvm.wmma.load' op expected destination type is a structure of 4 elements of type 'vector<2xf16>'}} + %0 = nvvm.wmma.load %arg0, %arg1 + {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} + : (!llvm.ptr<3>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>)> + + llvm.return +} + +// ----- + +llvm.func @wmmaStoreOp_invalid_mem_space(%arg0: !llvm.ptr<5>, %arg1: i32, + %arg2: vector<2 x f16>, %arg3: vector<2 x f16>, + %arg4: vector<2 x f16>, %arg5: vector<2 xf16>) { + // expected-error@+1 {{'nvvm.wmma.store' op expected operands to be a source pointer in memory space 0, 1, 3}} + nvvm.wmma.store %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 + {eltype = #nvvm.mma_type, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} + : !llvm.ptr<5>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16> + llvm.return +} + +// ----- + +llvm.func @wmmald_matrix(%arg0: !llvm.ptr) { + // expected-error@+1 
{{'nvvm.ldmatrix' op expected source pointer in memory space 3}} + %l = nvvm.ldmatrix %arg0 {num = 1 : i32, layout = #nvvm.mma_layout} : (!llvm.ptr) -> i32 + llvm.return +} + +// ----- + +llvm.func @wmmald_matrix(%arg0: !llvm.ptr) { + // expected-error@+1 {{'nvvm.ldmatrix' op expected num attribute to be 1, 2 or 4}} + %l = nvvm.ldmatrix %arg0 {num = 3 : i32, layout = #nvvm.mma_layout} : (!llvm.ptr) -> i32 + llvm.return +} + +// ----- + +llvm.func @wmmald_matrix(%arg0: !llvm.ptr) { + // expected-error@+1 {{'nvvm.ldmatrix' op expected destination type is i32}} + %l = nvvm.ldmatrix %arg0 {num = 1 : i32, layout = #nvvm.mma_layout} : (!llvm.ptr) -> !llvm.struct<(i32)> + llvm.return +} + +// ----- + +llvm.func @wmmald_matrix(%arg0: !llvm.ptr) { + // expected-error@+1 {{'nvvm.ldmatrix' op expected destination type is a structure of 4 elements of type i32}} + %l = nvvm.ldmatrix %arg0 {num = 4 : i32, layout = #nvvm.mma_layout} : (!llvm.ptr) -> !llvm.struct<(i32, i32)> + llvm.return +} + +// ----- + +func.func @cp_async(%arg0: !llvm.ptr, %arg1: !llvm.ptr) { + // expected-error @below {{expected byte size to be either 4, 8 or 16.}} + nvvm.cp.async.shared.global %arg0, %arg1, 32 : !llvm.ptr, !llvm.ptr + return +} + +// ----- + +func.func @cp_async(%arg0: !llvm.ptr, %arg1: !llvm.ptr) { + // expected-error @below {{bypass l1 is only support for 16 bytes copy.}} + nvvm.cp.async.shared.global %arg0, %arg1, 8 {bypass_l1} : !llvm.ptr, !llvm.ptr + return +} + +// ----- + +func.func @gep_struct_variable(%arg0: !llvm.ptr>, %arg1: i32, %arg2: i32) { + // expected-error @below {{op expected index 1 indexing a struct to be constant}} + llvm.getelementptr %arg0[%arg1, %arg1] : (!llvm.ptr>, i32, i32) -> !llvm.ptr + return +} + +// ----- + +func.func @gep_out_of_bounds(%ptr: !llvm.ptr)>>, %idx: i64) { + // expected-error @below {{index 2 indexing a struct is out of bounds}} + llvm.getelementptr %ptr[%idx, 1, 3] : (!llvm.ptr)>>, i64) -> !llvm.ptr + return +} diff --git a/mlir/test/Dialect/LLVMIR/invalid.mlir b/mlir/test/Dialect/LLVMIR/invalid.mlir index 3e019144a199b..c3af84e55b881 100644 --- a/mlir/test/Dialect/LLVMIR/invalid.mlir +++ b/mlir/test/Dialect/LLVMIR/invalid.mlir @@ -64,7 +64,7 @@ func.func @alloca_missing_input_result_type(%size : i64) { func.func @alloca_missing_input_type() { // expected-error@+1 {{expected trailing function type with one argument and one result}} - llvm.alloca %size x i32 : () -> (!llvm.ptr) + llvm.alloca %size x i32 : () -> (!llvm.ptr) } // ----- @@ -78,14 +78,14 @@ func.func @alloca_missing_result_type() { func.func @alloca_non_function_type() { // expected-error@+1 {{expected trailing function type with one argument and one result}} - llvm.alloca %size x i32 : !llvm.ptr + llvm.alloca %size x i32 : !llvm.ptr } // ----- func.func @alloca_non_integer_alignment() { // expected-error@+1 {{expected integer alignment}} - llvm.alloca %size x i32 {alignment = 3.0} : !llvm.ptr + llvm.alloca %size x i32 {alignment = 3.0} : !llvm.ptr } // ----- @@ -97,44 +97,37 @@ func.func @alloca_opaque_ptr_no_type(%sz : i64) { // ----- -func.func @alloca_ptr_type_attr_non_opaque_ptr(%sz : i64) { - // expected-error@below {{unexpected 'elem_type' attribute when non-opaque pointer type is used}} - "llvm.alloca"(%sz) { elem_type = i32 } : (i64) -> !llvm.ptr -} - -// ----- - -func.func @gep_missing_input_result_type(%pos : i64, %base : !llvm.ptr) { +func.func @gep_missing_input_result_type(%pos : i64, %base : !llvm.ptr) { // expected-error@+1 {{2 operands present, but expected 0}} llvm.getelementptr %base[%pos] 
: () -> () } // ----- -func.func @gep_missing_input_type(%pos : i64, %base : !llvm.ptr) { +func.func @gep_missing_input_type(%pos : i64, %base : !llvm.ptr) { // expected-error@+1 {{2 operands present, but expected 0}} - llvm.getelementptr %base[%pos] : () -> (!llvm.ptr) + llvm.getelementptr %base[%pos] : () -> (!llvm.ptr) } // ----- -func.func @gep_missing_result_type(%pos : i64, %base : !llvm.ptr) { +func.func @gep_missing_result_type(%pos : i64, %base : !llvm.ptr) { // expected-error@+1 {{op requires one result}} - llvm.getelementptr %base[%pos] : (!llvm.ptr, i64) -> () + llvm.getelementptr %base[%pos] : (!llvm.ptr, i64) -> () } // ----- -func.func @gep_non_function_type(%pos : i64, %base : !llvm.ptr) { +func.func @gep_non_function_type(%pos : i64, %base : !llvm.ptr) { // expected-error@+1 {{invalid kind of type specified}} - llvm.getelementptr %base[%pos] : !llvm.ptr + llvm.getelementptr %base[%pos] : !llvm.ptr } // ----- -func.func @gep_too_few_dynamic(%base : !llvm.ptr) { +func.func @gep_too_few_dynamic(%base : !llvm.ptr) { // expected-error@+1 {{expected as many dynamic indices as specified in 'rawConstantIndices'}} - %1 = "llvm.getelementptr"(%base) {rawConstantIndices = array} : (!llvm.ptr) -> !llvm.ptr + %1 = "llvm.getelementptr"(%base) {elem_type = f32, rawConstantIndices = array} : (!llvm.ptr) -> !llvm.ptr } // ----- @@ -302,14 +295,6 @@ func.func @call_unknown_symbol() { // ----- -func.func @call_variadic(%callee : !llvm.ptr>, %arg : i8) { - // expected-error@+1 {{indirect calls to variadic functions are not supported}} - llvm.call %callee(%arg) : !llvm.ptr>, (i8) -> (i8) - llvm.return -} - -// ----- - func.func private @standard_func_callee() func.func @call_non_llvm() { @@ -346,14 +331,6 @@ func.func @callee_arg_mismatch(%arg0 : i32) { // ----- -func.func @indirect_callee_arg_mismatch(%arg0 : i32, %callee : !llvm.ptr>) { - // expected-error@+1 {{'llvm.call' op operand type mismatch for operand 0: 'i32' != 'i8'}} - "llvm.call"(%callee, %arg0) : (!llvm.ptr>, i32) -> () - llvm.return -} - -// ----- - llvm.func @callee_func() -> (i8) func.func @callee_return_mismatch() { @@ -364,14 +341,6 @@ func.func @callee_return_mismatch() { // ----- -func.func @indirect_callee_return_mismatch(%callee : !llvm.ptr>) { - // expected-error@+1 {{'llvm.call' op result type mismatch: 'i32' != 'i8'}} - "llvm.call"(%callee) : (!llvm.ptr>) -> (i32) - llvm.return -} - -// ----- - func.func @call_too_many_results(%callee : !llvm.ptr) { // expected-error@+1 {{expected function with 0 or 1 result}} llvm.call %callee() : !llvm.ptr, () -> (i32, i32) @@ -406,14 +375,14 @@ llvm.func @func_result_mismatch(%arg0: f32) -> i32 { func.func @constant_wrong_type() { // expected-error@+1 {{only supports integer, float, string or elements attributes}} - llvm.mlir.constant(@constant_wrong_type) : !llvm.ptr> + llvm.mlir.constant(@constant_wrong_type) : !llvm.ptr } // ----- func.func @constant_wrong_type_string() { // expected-error@below {{expected array type of 3 i8 elements for the string constant}} - llvm.mlir.constant("foo") : !llvm.ptr + llvm.mlir.constant("foo") : !llvm.ptr } // ----- @@ -671,47 +640,39 @@ func.func @atomicrmw_expected_ptr(%f32 : f32) { // ----- -func.func @atomicrmw_mismatched_operands(%f32_ptr : !llvm.ptr, %i32 : i32) { - // expected-error@+1 {{expected LLVM IR element type for operand #0 to match type for operand #1}} - %0 = "llvm.atomicrmw"(%f32_ptr, %i32) {bin_op=11, ordering=1} : (!llvm.ptr, i32) -> i32 - llvm.return -} - -// ----- - -func.func @atomicrmw_mismatched_operands(%f32_ptr : 
!llvm.ptr, %f32 : f32) { +func.func @atomicrmw_mismatched_operands(%f32_ptr : !llvm.ptr, %f32 : f32) { // expected-error@+1 {{op failed to verify that result #0 and operand #1 have the same type}} - %0 = "llvm.atomicrmw"(%f32_ptr, %f32) {bin_op=11, ordering=1} : (!llvm.ptr, f32) -> i32 + %0 = "llvm.atomicrmw"(%f32_ptr, %f32) {bin_op=11, ordering=1} : (!llvm.ptr, f32) -> i32 llvm.return } // ----- -func.func @atomicrmw_expected_float(%i32_ptr : !llvm.ptr, %i32 : i32) { +func.func @atomicrmw_expected_float(%i32_ptr : !llvm.ptr, %i32 : i32) { // expected-error@+1 {{expected LLVM IR floating point type}} - %0 = llvm.atomicrmw fadd %i32_ptr, %i32 unordered : !llvm.ptr, i32 + %0 = llvm.atomicrmw fadd %i32_ptr, %i32 unordered : !llvm.ptr, i32 llvm.return } // ----- -func.func @atomicrmw_unexpected_xchg_type(%i1_ptr : !llvm.ptr, %i1 : i1) { +func.func @atomicrmw_unexpected_xchg_type(%i1_ptr : !llvm.ptr, %i1 : i1) { // expected-error@+1 {{unexpected LLVM IR type for 'xchg' bin_op}} - %0 = llvm.atomicrmw xchg %i1_ptr, %i1 unordered : !llvm.ptr, i1 + %0 = llvm.atomicrmw xchg %i1_ptr, %i1 unordered : !llvm.ptr, i1 llvm.return } // ----- -func.func @atomicrmw_expected_int(%f32_ptr : !llvm.ptr, %f32 : f32) { +func.func @atomicrmw_expected_int(%f32_ptr : !llvm.ptr, %f32 : f32) { // expected-error@+1 {{expected LLVM IR integer type}} - %0 = llvm.atomicrmw max %f32_ptr, %f32 unordered : !llvm.ptr, f32 + %0 = llvm.atomicrmw max %f32_ptr, %f32 unordered : !llvm.ptr, f32 llvm.return } // ----- -func.func @cmpxchg_expected_ptr(%f32_ptr : !llvm.ptr, %f32 : f32) { +func.func @cmpxchg_expected_ptr(%f32 : f32) { // expected-error@+1 {{op operand #0 must be LLVM pointer to integer or LLVM pointer type}} %0 = "llvm.cmpxchg"(%f32, %f32, %f32) {success_ordering=2,failure_ordering=2} : (f32, f32, f32) -> !llvm.struct<(f32, i1)> llvm.return @@ -719,14 +680,6 @@ func.func @cmpxchg_expected_ptr(%f32_ptr : !llvm.ptr, %f32 : f32) { // ----- -func.func @cmpxchg_mismatched_operands(%i64_ptr : !llvm.ptr, %i32 : i32) { - // expected-error@+1 {{expected LLVM IR element type for operand #0 to match type for all other operands}} - %0 = "llvm.cmpxchg"(%i64_ptr, %i32, %i32) {success_ordering=2,failure_ordering=2} : (!llvm.ptr, i32, i32) -> !llvm.struct<(i32, i1)> - llvm.return -} - -// ----- - func.func @cmpxchg_mismatched_value_operands(%ptr : !llvm.ptr, %i32 : i32, %i64 : i64) { // expected-error@+1 {{op failed to verify that operand #1 and operand #2 have the same type}} %0 = "llvm.cmpxchg"(%ptr, %i32, %i64) {success_ordering=2,failure_ordering=2} : (!llvm.ptr, i32, i64) -> !llvm.struct<(i32, i1)> @@ -743,41 +696,41 @@ func.func @cmpxchg_mismatched_result(%ptr : !llvm.ptr, %i64 : i64) { // ----- -func.func @cmpxchg_unexpected_type(%i1_ptr : !llvm.ptr, %i1 : i1) { +func.func @cmpxchg_unexpected_type(%i1_ptr : !llvm.ptr, %i1 : i1) { // expected-error@+1 {{unexpected LLVM IR type}} - %0 = llvm.cmpxchg %i1_ptr, %i1, %i1 monotonic monotonic : !llvm.ptr, i1 + %0 = llvm.cmpxchg %i1_ptr, %i1, %i1 monotonic monotonic : !llvm.ptr, i1 llvm.return } // ----- -func.func @cmpxchg_at_least_monotonic_success(%i32_ptr : !llvm.ptr, %i32 : i32) { +func.func @cmpxchg_at_least_monotonic_success(%i32_ptr : !llvm.ptr, %i32 : i32) { // expected-error@+1 {{ordering must be at least 'monotonic'}} - %0 = llvm.cmpxchg %i32_ptr, %i32, %i32 unordered monotonic : !llvm.ptr, i32 + %0 = llvm.cmpxchg %i32_ptr, %i32, %i32 unordered monotonic : !llvm.ptr, i32 llvm.return } // ----- -func.func @cmpxchg_at_least_monotonic_failure(%i32_ptr : !llvm.ptr, %i32 : i32) { 
+func.func @cmpxchg_at_least_monotonic_failure(%i32_ptr : !llvm.ptr, %i32 : i32) { // expected-error@+1 {{ordering must be at least 'monotonic'}} - %0 = llvm.cmpxchg %i32_ptr, %i32, %i32 monotonic unordered : !llvm.ptr, i32 + %0 = llvm.cmpxchg %i32_ptr, %i32, %i32 monotonic unordered : !llvm.ptr, i32 llvm.return } // ----- -func.func @cmpxchg_failure_release(%i32_ptr : !llvm.ptr, %i32 : i32) { +func.func @cmpxchg_failure_release(%i32_ptr : !llvm.ptr, %i32 : i32) { // expected-error@+1 {{failure ordering cannot be 'release' or 'acq_rel'}} - %0 = llvm.cmpxchg %i32_ptr, %i32, %i32 acq_rel release : !llvm.ptr, i32 + %0 = llvm.cmpxchg %i32_ptr, %i32, %i32 acq_rel release : !llvm.ptr, i32 llvm.return } // ----- -func.func @cmpxchg_failure_acq_rel(%i32_ptr : !llvm.ptr, %i32 : i32) { +func.func @cmpxchg_failure_acq_rel(%i32_ptr : !llvm.ptr, %i32 : i32) { // expected-error@+1 {{failure ordering cannot be 'release' or 'acq_rel'}} - %0 = llvm.cmpxchg %i32_ptr, %i32, %i32 acq_rel acq_rel : !llvm.ptr, i32 + %0 = llvm.cmpxchg %i32_ptr, %i32, %i32 acq_rel acq_rel : !llvm.ptr, i32 llvm.return } @@ -786,7 +739,7 @@ func.func @cmpxchg_failure_acq_rel(%i32_ptr : !llvm.ptr, %i32 : i32) { llvm.func @foo(i32) -> i32 llvm.func @__gxx_personality_v0(...) -> i32 -llvm.func @bad_landingpad(%arg0: !llvm.ptr>) -> i32 attributes { personality = @__gxx_personality_v0} { +llvm.func @bad_landingpad(%arg0: !llvm.ptr) -> i32 attributes { personality = @__gxx_personality_v0} { %0 = llvm.mlir.constant(3 : i32) : i32 %1 = llvm.mlir.constant(2 : i32) : i32 %2 = llvm.invoke @foo(%1) to ^bb1 unwind ^bb2 : (i32) -> i32 @@ -794,7 +747,7 @@ llvm.func @bad_landingpad(%arg0: !llvm.ptr>) -> i32 attributes { persona llvm.return %1 : i32 ^bb2: // pred: ^bb0 // expected-error@+1 {{clause #0 is not a known constant - null, addressof, bitcast}} - %3 = llvm.landingpad cleanup (catch %1 : i32) (catch %arg0 : !llvm.ptr>) : !llvm.struct<(ptr, i32)> + %3 = llvm.landingpad cleanup (catch %1 : i32) (catch %arg0 : !llvm.ptr) : !llvm.struct<(ptr, i32)> llvm.return %0 : i32 } @@ -805,15 +758,15 @@ llvm.func @__gxx_personality_v0(...) 
-> i32 llvm.func @caller(%arg0: i32) -> i32 attributes { personality = @__gxx_personality_v0} { %0 = llvm.mlir.constant(1 : i32) : i32 - %1 = llvm.alloca %0 x !llvm.ptr : (i32) -> !llvm.ptr> + %1 = llvm.alloca %0 x !llvm.ptr : (i32) -> !llvm.ptr // expected-note@+1 {{global addresses expected as operand to bitcast used in clauses for landingpad}} - %2 = llvm.bitcast %1 : !llvm.ptr> to !llvm.ptr + %2 = llvm.bitcast %1 : !llvm.ptr to !llvm.ptr %3 = llvm.invoke @foo(%0) to ^bb1 unwind ^bb2 : (i32) -> i32 ^bb1: // pred: ^bb0 llvm.return %0 : i32 ^bb2: // pred: ^bb0 // expected-error@+1 {{constant clauses expected}} - %5 = llvm.landingpad (catch %2 : !llvm.ptr) : !llvm.struct<(ptr, i32)> + %5 = llvm.landingpad (catch %2 : !llvm.ptr) : !llvm.struct<(ptr, i32)> llvm.return %0 : i32 } @@ -829,7 +782,7 @@ llvm.func @caller(%arg0: i32) -> i32 attributes { personality = @__gxx_personali llvm.return %0 : i32 ^bb2: // pred: ^bb0 // expected-error@+1 {{landingpad instruction expects at least one clause or cleanup attribute}} - %2 = llvm.landingpad : !llvm.struct<(ptr, i32)> + %2 = llvm.landingpad : !llvm.struct<(ptr, i32)> llvm.return %0 : i32 } @@ -844,7 +797,7 @@ llvm.func @caller(%arg0: i32) -> i32 attributes { personality = @__gxx_personali ^bb1: // pred: ^bb0 llvm.return %0 : i32 ^bb2: // pred: ^bb0 - %2 = llvm.landingpad cleanup : !llvm.struct<(ptr, i32)> + %2 = llvm.landingpad cleanup : !llvm.struct<(ptr, i32)> // expected-error@+1 {{'llvm.resume' op expects landingpad value as operand}} llvm.resume %0 : i32 } @@ -860,8 +813,8 @@ llvm.func @caller(%arg0: i32) -> i32 { llvm.return %0 : i32 ^bb2: // pred: ^bb0 // expected-error@+1 {{llvm.landingpad needs to be in a function with a personality}} - %2 = llvm.landingpad cleanup : !llvm.struct<(ptr, i32)> - llvm.resume %2 : !llvm.struct<(ptr, i32)> + %2 = llvm.landingpad cleanup : !llvm.struct<(ptr, i32)> + llvm.resume %2 : !llvm.struct<(ptr, i32)> } // ----- @@ -1056,55 +1009,55 @@ module { // ----- -llvm.func @wmmaLoadOp_invalid_mem_space(%arg0: !llvm.ptr, %arg1: i32) { +llvm.func @wmmaLoadOp_invalid_mem_space(%arg0: !llvm.ptr<5>, %arg1: i32) { // expected-error@+1 {{'nvvm.wmma.load' op expected source pointer in memory space 0, 1, 3}} %0 = nvvm.wmma.load %arg0, %arg1 {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} - : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> + : (!llvm.ptr<5>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> llvm.return } // ----- -llvm.func @wmmaLoadOp_invalid_AOp(%arg0: !llvm.ptr, %arg1: i32) { +llvm.func @wmmaLoadOp_invalid_AOp(%arg0: !llvm.ptr<3>, %arg1: i32) { // expected-error@+1 {{'nvvm.wmma.load' op expected destination type is a structure of 8 elements of type 'vector<2xf16>'}} %0 = nvvm.wmma.load %arg0, %arg1 {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} - : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> + : (!llvm.ptr<3>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> llvm.return } // ----- -llvm.func @wmmaLoadOp_invalid_BOp(%arg0: !llvm.ptr, %arg1: i32) { +llvm.func @wmmaLoadOp_invalid_BOp(%arg0: !llvm.ptr<3>, %arg1: i32) { // 
expected-error@+1 {{'nvvm.wmma.load' op expected destination type is a structure of 8 elements of type 'vector<2xf16>'}} %0 = nvvm.wmma.load %arg0, %arg1 {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} - : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> + : (!llvm.ptr<3>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> llvm.return } // ----- -llvm.func @wmmaLoadOp_invalid_COp(%arg0: !llvm.ptr, %arg1: i32) { +llvm.func @wmmaLoadOp_invalid_COp(%arg0: !llvm.ptr<3>, %arg1: i32) { // expected-error@+1 {{'nvvm.wmma.load' op expected destination type is a structure of 4 elements of type 'vector<2xf16>'}} %0 = nvvm.wmma.load %arg0, %arg1 {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} - : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>)> + : (!llvm.ptr<3>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>)> llvm.return } // ----- -llvm.func @wmmaStoreOp_invalid_mem_space(%arg0: !llvm.ptr, %arg1: i32, +llvm.func @wmmaStoreOp_invalid_mem_space(%arg0: !llvm.ptr<5>, %arg1: i32, %arg2: vector<2 x f16>, %arg3: vector<2 x f16>, %arg4: vector<2 x f16>, %arg5: vector<2 xf16>) { // expected-error@+1 {{'nvvm.wmma.store' op expected operands to be a source pointer in memory space 0, 1, 3}} nvvm.wmma.store %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 {eltype = #nvvm.mma_type, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} - : !llvm.ptr, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16> + : !llvm.ptr<5>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16> llvm.return } @@ -1208,33 +1161,33 @@ llvm.func @gpu_wmma_mma_op_invalid_result(%arg0: vector<2 x f16>, %arg1: vector< // ----- -llvm.func @wmmald_matrix(%arg0: !llvm.ptr) { +llvm.func @wmmald_matrix(%arg0: !llvm.ptr) { // expected-error@+1 {{'nvvm.ldmatrix' op expected source pointer in memory space 3}} - %l = nvvm.ldmatrix %arg0 {num = 1 : i32, layout = #nvvm.mma_layout} : (!llvm.ptr) -> i32 + %l = nvvm.ldmatrix %arg0 {num = 1 : i32, layout = #nvvm.mma_layout} : (!llvm.ptr) -> i32 llvm.return } // ----- -llvm.func @wmmald_matrix(%arg0: !llvm.ptr) { +llvm.func @wmmald_matrix(%arg0: !llvm.ptr<3>) { // expected-error@+1 {{'nvvm.ldmatrix' op expected num attribute to be 1, 2 or 4}} - %l = nvvm.ldmatrix %arg0 {num = 3 : i32, layout = #nvvm.mma_layout} : (!llvm.ptr) -> i32 + %l = nvvm.ldmatrix %arg0 {num = 3 : i32, layout = #nvvm.mma_layout} : (!llvm.ptr<3>) -> i32 llvm.return } // ----- -llvm.func @wmmald_matrix(%arg0: !llvm.ptr) { +llvm.func @wmmald_matrix(%arg0: !llvm.ptr<3>) { // expected-error@+1 {{'nvvm.ldmatrix' op expected destination type is i32}} - %l = nvvm.ldmatrix %arg0 {num = 1 : i32, layout = #nvvm.mma_layout} : (!llvm.ptr) -> !llvm.struct<(i32)> + %l = nvvm.ldmatrix %arg0 {num = 1 : i32, layout = #nvvm.mma_layout} : (!llvm.ptr<3>) -> !llvm.struct<(i32)> llvm.return } // ----- -llvm.func @wmmald_matrix(%arg0: !llvm.ptr) { +llvm.func @wmmald_matrix(%arg0: !llvm.ptr<3>) { // expected-error@+1 {{'nvvm.ldmatrix' op expected destination type is a structure of 4 elements of type i32}} - %l = nvvm.ldmatrix %arg0 {num = 4 : i32, layout = #nvvm.mma_layout} : (!llvm.ptr) -> !llvm.struct<(i32, i32)> + %l = nvvm.ldmatrix %arg0 {num = 4 : i32, layout = #nvvm.mma_layout} : (!llvm.ptr<3>) -> !llvm.struct<(i32, 
i32)> llvm.return } @@ -1278,33 +1231,33 @@ func.func @bitcast(%arg0: vector<2x3xf32>) { // ----- -func.func @cp_async(%arg0: !llvm.ptr, %arg1: !llvm.ptr) { +func.func @cp_async(%arg0: !llvm.ptr<3>, %arg1: !llvm.ptr<1>) { // expected-error @below {{expected byte size to be either 4, 8 or 16.}} - nvvm.cp.async.shared.global %arg0, %arg1, 32 : !llvm.ptr, !llvm.ptr + nvvm.cp.async.shared.global %arg0, %arg1, 32 : !llvm.ptr<3>, !llvm.ptr<1> return } // ----- -func.func @cp_async(%arg0: !llvm.ptr, %arg1: !llvm.ptr) { +func.func @cp_async(%arg0: !llvm.ptr<3>, %arg1: !llvm.ptr<1>) { // expected-error @below {{bypass l1 is only support for 16 bytes copy.}} - nvvm.cp.async.shared.global %arg0, %arg1, 8 {bypass_l1} : !llvm.ptr, !llvm.ptr + nvvm.cp.async.shared.global %arg0, %arg1, 8 {bypass_l1} : !llvm.ptr<3>, !llvm.ptr<1> return } // ----- -func.func @gep_struct_variable(%arg0: !llvm.ptr>, %arg1: i32, %arg2: i32) { +func.func @gep_struct_variable(%arg0: !llvm.ptr, %arg1: i32, %arg2: i32) { // expected-error @below {{op expected index 1 indexing a struct to be constant}} - llvm.getelementptr %arg0[%arg1, %arg1] : (!llvm.ptr>, i32, i32) -> !llvm.ptr + llvm.getelementptr %arg0[%arg1, %arg1] : (!llvm.ptr, i32, i32) -> !llvm.ptr, !llvm.struct<(i32)> return } // ----- -func.func @gep_out_of_bounds(%ptr: !llvm.ptr)>>, %idx: i64) { +func.func @gep_out_of_bounds(%ptr: !llvm.ptr, %idx: i64) { // expected-error @below {{index 2 indexing a struct is out of bounds}} - llvm.getelementptr %ptr[%idx, 1, 3] : (!llvm.ptr)>>, i64) -> !llvm.ptr + llvm.getelementptr %ptr[%idx, 1, 3] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(i32, struct<(i32, f32)>)> return } @@ -1321,8 +1274,8 @@ func.func @non_splat_shuffle_on_scalable_vector(%arg0: vector<[4]xf32>) { llvm.mlir.global internal @side_effecting_global() : !llvm.struct<(i8)> { %0 = llvm.mlir.constant(1 : i64) : i64 // expected-error@below {{ops with side effects not allowed in global initializers}} - %1 = llvm.alloca %0 x !llvm.struct<(i8)> : (i64) -> !llvm.ptr> - %2 = llvm.load %1 : !llvm.ptr> + %1 = llvm.alloca %0 x !llvm.struct<(i8)> : (i64) -> !llvm.ptr + %2 = llvm.load %1 : !llvm.ptr -> !llvm.struct<(i8)> llvm.return %2 : !llvm.struct<(i8)> } diff --git a/mlir/test/Dialect/LLVMIR/layout-typed-pointers.mlir b/mlir/test/Dialect/LLVMIR/layout-typed-pointers.mlir new file mode 100644 index 0000000000000..5cf1ed03e64c8 --- /dev/null +++ b/mlir/test/Dialect/LLVMIR/layout-typed-pointers.mlir @@ -0,0 +1,145 @@ +// RUN: mlir-opt --test-data-layout-query --split-input-file --verify-diagnostics %s | FileCheck %s + +module { + // CHECK: @no_spec + func.func @no_spec() { + // CHECK: alignment = 8 + // CHECK: alloca_memory_space = 0 + // CHECK: bitsize = 64 + // CHECK: preferred = 8 + // CHECK: size = 8 + "test.data_layout_query"() : () -> !llvm.ptr + // CHECK: alignment = 8 + // CHECK: alloca_memory_space = 0 + // CHECK: bitsize = 64 + // CHECK: preferred = 8 + // CHECK: size = 8 + "test.data_layout_query"() : () -> !llvm.ptr + // CHECK: alignment = 8 + // CHECK: alloca_memory_space = 0 + // CHECK: bitsize = 64 + // CHECK: preferred = 8 + // CHECK: size = 8 + "test.data_layout_query"() : () -> !llvm.ptr + // CHECK: alignment = 8 + // CHECK: alloca_memory_space = 0 + // CHECK: bitsize = 64 + // CHECK: preferred = 8 + // CHECK: size = 8 + "test.data_layout_query"() : () -> !llvm.ptr> + // CHECK: alignment = 8 + // CHECK: alloca_memory_space = 0 + // CHECK: bitsize = 64 + // CHECK: preferred = 8 + // CHECK: size = 8 + "test.data_layout_query"() : () -> !llvm.ptr + // CHECK: 
alignment = 8 + // CHECK: alloca_memory_space = 0 + // CHECK: bitsize = 64 + // CHECK: preferred = 8 + // CHECK: size = 8 + "test.data_layout_query"() : () -> !llvm.ptr + // CHECK: alignment = 8 + // CHECK: alloca_memory_space = 0 + // CHECK: bitsize = 64 + // CHECK: preferred = 8 + // CHECK: size = 8 + "test.data_layout_query"() : () -> !llvm.ptr<5> + return + } +} + +// ----- + +module attributes { dlti.dl_spec = #dlti.dl_spec< + #dlti.dl_entry, dense<[32, 32, 64]> : vector<3xi32>>, + #dlti.dl_entry, dense<[64, 64, 64]> : vector<3xi32>>, + #dlti.dl_entry, dense<[32, 64, 64]> : vector<3xi32>>, + #dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32> +>} { + // CHECK: @spec + func.func @spec() { + // CHECK: alignment = 4 + // CHECK: alloca_memory_space = 5 + // CHECK: bitsize = 32 + // CHECK: preferred = 8 + // CHECK: size = 4 + "test.data_layout_query"() : () -> !llvm.ptr + // CHECK: alignment = 4 + // CHECK: alloca_memory_space = 5 + // CHECK: bitsize = 32 + // CHECK: preferred = 8 + // CHECK: size = 4 + "test.data_layout_query"() : () -> !llvm.ptr + // CHECK: alignment = 4 + // CHECK: alloca_memory_space = 5 + // CHECK: bitsize = 32 + // CHECK: preferred = 8 + // CHECK: size = 4 + "test.data_layout_query"() : () -> !llvm.ptr + // CHECK: alignment = 4 + // CHECK: alloca_memory_space = 5 + // CHECK: bitsize = 32 + // CHECK: preferred = 8 + // CHECK: size = 4 + "test.data_layout_query"() : () -> !llvm.ptr> + // CHECK: alignment = 4 + // CHECK: alloca_memory_space = 5 + // CHECK: bitsize = 32 + // CHECK: preferred = 8 + // CHECK: size = 4 + "test.data_layout_query"() : () -> !llvm.ptr + // CHECK: alignment = 8 + // CHECK: alloca_memory_space = 5 + // CHECK: bitsize = 64 + // CHECK: preferred = 8 + // CHECK: size = 8 + "test.data_layout_query"() : () -> !llvm.ptr + // CHECK: alignment = 4 + // CHECK: alloca_memory_space = 5 + // CHECK: bitsize = 32 + // CHECK: preferred = 8 + // CHECK: size = 4 + "test.data_layout_query"() : () -> !llvm.ptr<3> + // CHECK: alignment = 8 + // CHECK: alloca_memory_space = 5 + // CHECK: bitsize = 32 + // CHECK: preferred = 8 + // CHECK: size = 4 + "test.data_layout_query"() : () -> !llvm.ptr<4> + return + } +} + +// ----- + +// expected-error@below {{unexpected layout attribute for pointer to 'i32'}} +module attributes { dlti.dl_spec = #dlti.dl_spec< + #dlti.dl_entry, dense<[64, 64, 64]> : vector<3xi32>> +>} { + func.func @pointer() { + return + } +} + +// ----- + +// expected-error@below {{expected layout attribute for '!llvm.ptr' to be a dense integer elements attribute with 3 or 4 elements}} +module attributes { dlti.dl_spec = #dlti.dl_spec< + #dlti.dl_entry, dense<[64.0, 64.0, 64.0]> : vector<3xf32>> +>} { + func.func @pointer() { + return + } +} + +// ----- + +// expected-error@below {{preferred alignment is expected to be at least as large as ABI alignment}} +module attributes { dlti.dl_spec = #dlti.dl_spec< + #dlti.dl_entry, dense<[64, 64, 32]> : vector<3xi32>> +>} { + func.func @pointer() { + return + } +} diff --git a/mlir/test/Dialect/LLVMIR/layout.mlir b/mlir/test/Dialect/LLVMIR/layout.mlir index e5c8c0bd86db1..d6e2013cc86ca 100644 --- a/mlir/test/Dialect/LLVMIR/layout.mlir +++ b/mlir/test/Dialect/LLVMIR/layout.mlir @@ -3,42 +3,13 @@ module { // CHECK: @no_spec func.func @no_spec() { + "test.data_layout_query"() : () -> !llvm.ptr // CHECK: alignment = 8 // CHECK: alloca_memory_space = 0 // CHECK: bitsize = 64 // CHECK: preferred = 8 // CHECK: size = 8 - "test.data_layout_query"() : () -> !llvm.ptr - // CHECK: alignment = 8 - // CHECK: 
alloca_memory_space = 0 - // CHECK: bitsize = 64 - // CHECK: preferred = 8 - // CHECK: size = 8 - "test.data_layout_query"() : () -> !llvm.ptr - // CHECK: alignment = 8 - // CHECK: alloca_memory_space = 0 - // CHECK: bitsize = 64 - // CHECK: preferred = 8 - // CHECK: size = 8 - "test.data_layout_query"() : () -> !llvm.ptr - // CHECK: alignment = 8 - // CHECK: alloca_memory_space = 0 - // CHECK: bitsize = 64 - // CHECK: preferred = 8 - // CHECK: size = 8 - "test.data_layout_query"() : () -> !llvm.ptr> - // CHECK: alignment = 8 - // CHECK: alloca_memory_space = 0 - // CHECK: bitsize = 64 - // CHECK: preferred = 8 - // CHECK: size = 8 - "test.data_layout_query"() : () -> !llvm.ptr - // CHECK: alignment = 8 - // CHECK: alloca_memory_space = 0 - // CHECK: bitsize = 64 - // CHECK: preferred = 8 - // CHECK: size = 8 - "test.data_layout_query"() : () -> !llvm.ptr + "test.data_layout_query"() : () -> !llvm.ptr<3> // CHECK: alignment = 8 // CHECK: alloca_memory_space = 0 // CHECK: bitsize = 64 @@ -52,8 +23,8 @@ module { // ----- module attributes { dlti.dl_spec = #dlti.dl_spec< - #dlti.dl_entry, dense<[32, 32, 64]> : vector<3xi32>>, - #dlti.dl_entry, dense<[64, 64, 64]> : vector<3xi32>>, + #dlti.dl_entry : vector<3xi32>>, + #dlti.dl_entry, dense<[64, 64, 64]> : vector<3xi32>>, #dlti.dl_entry, dense<[32, 64, 64]> : vector<3xi32>>, #dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32> >} { @@ -64,37 +35,19 @@ module attributes { dlti.dl_spec = #dlti.dl_spec< // CHECK: bitsize = 32 // CHECK: preferred = 8 // CHECK: size = 4 - "test.data_layout_query"() : () -> !llvm.ptr + "test.data_layout_query"() : () -> !llvm.ptr // CHECK: alignment = 4 // CHECK: alloca_memory_space = 5 // CHECK: bitsize = 32 // CHECK: preferred = 8 // CHECK: size = 4 - "test.data_layout_query"() : () -> !llvm.ptr - // CHECK: alignment = 4 - // CHECK: alloca_memory_space = 5 - // CHECK: bitsize = 32 - // CHECK: preferred = 8 - // CHECK: size = 4 - "test.data_layout_query"() : () -> !llvm.ptr - // CHECK: alignment = 4 - // CHECK: alloca_memory_space = 5 - // CHECK: bitsize = 32 - // CHECK: preferred = 8 - // CHECK: size = 4 - "test.data_layout_query"() : () -> !llvm.ptr> - // CHECK: alignment = 4 - // CHECK: alloca_memory_space = 5 - // CHECK: bitsize = 32 - // CHECK: preferred = 8 - // CHECK: size = 4 - "test.data_layout_query"() : () -> !llvm.ptr + "test.data_layout_query"() : () -> !llvm.ptr<3> // CHECK: alignment = 8 // CHECK: alloca_memory_space = 5 // CHECK: bitsize = 64 // CHECK: preferred = 8 // CHECK: size = 8 - "test.data_layout_query"() : () -> !llvm.ptr + "test.data_layout_query"() : () -> !llvm.ptr<5> // CHECK: alignment = 4 // CHECK: alloca_memory_space = 5 // CHECK: bitsize = 32 @@ -113,20 +66,9 @@ module attributes { dlti.dl_spec = #dlti.dl_spec< // ----- -// expected-error@below {{unexpected layout attribute for pointer to 'i32'}} -module attributes { dlti.dl_spec = #dlti.dl_spec< - #dlti.dl_entry, dense<[64, 64, 64]> : vector<3xi32>> ->} { - func.func @pointer() { - return - } -} - -// ----- - -// expected-error@below {{expected layout attribute for '!llvm.ptr' to be a dense integer elements attribute with 3 or 4 elements}} +// expected-error@below {{expected layout attribute for '!llvm.ptr' to be a dense integer elements attribute with 3 or 4 elements}} module attributes { dlti.dl_spec = #dlti.dl_spec< - #dlti.dl_entry, dense<[64.0, 64.0, 64.0]> : vector<3xf32>> + #dlti.dl_entry : vector<3xf32>> >} { func.func @pointer() { return @@ -137,7 +79,7 @@ module attributes { dlti.dl_spec = #dlti.dl_spec< // 
expected-error@below {{preferred alignment is expected to be at least as large as ABI alignment}} module attributes { dlti.dl_spec = #dlti.dl_spec< - #dlti.dl_entry, dense<[64, 64, 32]> : vector<3xi32>> + #dlti.dl_entry : vector<3xi32>> >} { func.func @pointer() { return diff --git a/mlir/test/Dialect/LLVMIR/nvvm-typed-pointers.mlir b/mlir/test/Dialect/LLVMIR/nvvm-typed-pointers.mlir new file mode 100644 index 0000000000000..5fbadd1dc414e --- /dev/null +++ b/mlir/test/Dialect/LLVMIR/nvvm-typed-pointers.mlir @@ -0,0 +1,55 @@ +// RUN: mlir-opt %s -split-input-file -verify-diagnostics | FileCheck %s + +// CHECK-LABEL: @nvvm_wmma_load_tf32 +func.func @nvvm_wmma_load_tf32(%arg0: !llvm.ptr, %arg1 : i32) -> !llvm.struct<(i32, i32, i32, i32)> { + // CHECK: nvvm.wmma.load {{.*}} {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 8 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} + %0 = nvvm.wmma.load %arg0, %arg1 + {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 8 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} + : (!llvm.ptr) -> !llvm.struct<(i32, i32, i32, i32)> + llvm.return %0 : !llvm.struct<(i32, i32, i32, i32)> +} + +// CHECK-LABEL: @cp_async +llvm.func @cp_async(%arg0: !llvm.ptr, %arg1: !llvm.ptr) { +// CHECK: nvvm.cp.async.shared.global %{{.*}}, %{{.*}}, 16 + nvvm.cp.async.shared.global %arg0, %arg1, 16 : !llvm.ptr, !llvm.ptr +// CHECK: nvvm.cp.async.shared.global %{{.*}}, %{{.*}}, 16 {bypass_l1} + nvvm.cp.async.shared.global %arg0, %arg1, 16 {bypass_l1} : !llvm.ptr, !llvm.ptr +// CHECK: nvvm.cp.async.commit.group + nvvm.cp.async.commit.group +// CHECK: nvvm.cp.async.wait.group 0 + nvvm.cp.async.wait.group 0 + llvm.return +} + +// CHECK-LABEL: llvm.func @ld_matrix +llvm.func @ld_matrix(%arg0: !llvm.ptr) { + // CHECK: nvvm.ldmatrix %{{.*}} {layout = #nvvm.mma_layout, num = 1 : i32} : (!llvm.ptr) -> i32 + %l1 = nvvm.ldmatrix %arg0 {num = 1 : i32, layout = #nvvm.mma_layout} : (!llvm.ptr) -> i32 + // CHECK: nvvm.ldmatrix %{{.*}} {layout = #nvvm.mma_layout, num = 2 : i32} : (!llvm.ptr) -> !llvm.struct<(i32, i32)> + %l2 = nvvm.ldmatrix %arg0 {num = 2 : i32, layout = #nvvm.mma_layout} : (!llvm.ptr) -> !llvm.struct<(i32, i32)> + // CHECK: nvvm.ldmatrix %{{.*}} {layout = #nvvm.mma_layout, num = 4 : i32} : (!llvm.ptr) -> !llvm.struct<(i32, i32, i32, i32)> + %l4 = nvvm.ldmatrix %arg0 {num = 4 : i32, layout = #nvvm.mma_layout} : (!llvm.ptr) -> !llvm.struct<(i32, i32, i32, i32)> + llvm.return +} + +// CHECK-LABEL: llvm.func @redux_sync +llvm.func @redux_sync(%value : i32, %offset : i32) -> i32 { + // CHECK: nvvm.redux.sync add %{{.*}} + %r1 = nvvm.redux.sync add %value, %offset : i32 -> i32 + // CHECK: nvvm.redux.sync max %{{.*}} + %r2 = nvvm.redux.sync max %value, %offset : i32 -> i32 + // CHECK: nvvm.redux.sync min %{{.*}} + %r3 = nvvm.redux.sync min %value, %offset : i32 -> i32 + // CHECK: nvvm.redux.sync umax %{{.*}} + %r5 = nvvm.redux.sync umax %value, %offset : i32 -> i32 + // CHECK: nvvm.redux.sync umin %{{.*}} + %r6 = nvvm.redux.sync umin %value, %offset : i32 -> i32 + // CHECK: nvvm.redux.sync and %{{.*}} + %r7 = nvvm.redux.sync and %value, %offset : i32 -> i32 + // CHECK: nvvm.redux.sync or %{{.*}} + %r8 = nvvm.redux.sync or %value, %offset : i32 -> i32 + // CHECK: nvvm.redux.sync xor %{{.*}} + %r9 = nvvm.redux.sync xor %value, %offset : i32 -> i32 + llvm.return %r1 : i32 +} diff --git a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir index 6596b8503d7a5..c7c83d29638c4 100644 --- a/mlir/test/Dialect/LLVMIR/nvvm.mlir +++ 
b/mlir/test/Dialect/LLVMIR/nvvm.mlir @@ -266,11 +266,11 @@ func.func @nvvm_mma_m16n8k32_s4_s4(%a0 : i32, %a1 : i32, } // CHECK-LABEL: @nvvm_wmma_load_tf32 -func.func @nvvm_wmma_load_tf32(%arg0: !llvm.ptr, %arg1 : i32) -> !llvm.struct<(i32, i32, i32, i32)> { +func.func @nvvm_wmma_load_tf32(%arg0: !llvm.ptr, %arg1 : i32) -> !llvm.struct<(i32, i32, i32, i32)> { // CHECK: nvvm.wmma.load {{.*}} {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 8 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} %0 = nvvm.wmma.load %arg0, %arg1 {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 8 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} - : (!llvm.ptr) -> !llvm.struct<(i32, i32, i32, i32)> + : (!llvm.ptr) -> !llvm.struct<(i32, i32, i32, i32)> llvm.return %0 : !llvm.struct<(i32, i32, i32, i32)> } @@ -288,11 +288,11 @@ func.func @nvvm_wmma_mma(%0 : i32, %1 : i32, %2 : i32, %3 : i32, %4 : i32, %5 : } // CHECK-LABEL: @cp_async -llvm.func @cp_async(%arg0: !llvm.ptr, %arg1: !llvm.ptr) { +llvm.func @cp_async(%arg0: !llvm.ptr<3>, %arg1: !llvm.ptr<1>) { // CHECK: nvvm.cp.async.shared.global %{{.*}}, %{{.*}}, 16 - nvvm.cp.async.shared.global %arg0, %arg1, 16 : !llvm.ptr, !llvm.ptr + nvvm.cp.async.shared.global %arg0, %arg1, 16 : !llvm.ptr<3>, !llvm.ptr<1> // CHECK: nvvm.cp.async.shared.global %{{.*}}, %{{.*}}, 16 {bypass_l1} - nvvm.cp.async.shared.global %arg0, %arg1, 16 {bypass_l1} : !llvm.ptr, !llvm.ptr + nvvm.cp.async.shared.global %arg0, %arg1, 16 {bypass_l1} : !llvm.ptr<3>, !llvm.ptr<1> // CHECK: nvvm.cp.async.commit.group nvvm.cp.async.commit.group // CHECK: nvvm.cp.async.wait.group 0 @@ -301,18 +301,18 @@ llvm.func @cp_async(%arg0: !llvm.ptr, %arg1: !llvm.ptr) { } // CHECK-LABEL: llvm.func @ld_matrix -llvm.func @ld_matrix(%arg0: !llvm.ptr) { - // CHECK: nvvm.ldmatrix %{{.*}} {layout = #nvvm.mma_layout, num = 1 : i32} : (!llvm.ptr) -> i32 - %l1 = nvvm.ldmatrix %arg0 {num = 1 : i32, layout = #nvvm.mma_layout} : (!llvm.ptr) -> i32 - // CHECK: nvvm.ldmatrix %{{.*}} {layout = #nvvm.mma_layout, num = 2 : i32} : (!llvm.ptr) -> !llvm.struct<(i32, i32)> - %l2 = nvvm.ldmatrix %arg0 {num = 2 : i32, layout = #nvvm.mma_layout} : (!llvm.ptr) -> !llvm.struct<(i32, i32)> - // CHECK: nvvm.ldmatrix %{{.*}} {layout = #nvvm.mma_layout, num = 4 : i32} : (!llvm.ptr) -> !llvm.struct<(i32, i32, i32, i32)> - %l4 = nvvm.ldmatrix %arg0 {num = 4 : i32, layout = #nvvm.mma_layout} : (!llvm.ptr) -> !llvm.struct<(i32, i32, i32, i32)> +llvm.func @ld_matrix(%arg0: !llvm.ptr<3>) { + // CHECK: nvvm.ldmatrix %{{.*}} {layout = #nvvm.mma_layout, num = 1 : i32} : (!llvm.ptr<3>) -> i32 + %l1 = nvvm.ldmatrix %arg0 {num = 1 : i32, layout = #nvvm.mma_layout} : (!llvm.ptr<3>) -> i32 + // CHECK: nvvm.ldmatrix %{{.*}} {layout = #nvvm.mma_layout, num = 2 : i32} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32)> + %l2 = nvvm.ldmatrix %arg0 {num = 2 : i32, layout = #nvvm.mma_layout} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32)> + // CHECK: nvvm.ldmatrix %{{.*}} {layout = #nvvm.mma_layout, num = 4 : i32} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)> + %l4 = nvvm.ldmatrix %arg0 {num = 4 : i32, layout = #nvvm.mma_layout} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)> llvm.return } // CHECK-LABEL: llvm.func @redux_sync -llvm.func @redux_sync(%value : i32, %offset : i32) -> i32 { +llvm.func @redux_sync(%value : i32, %offset : i32) -> i32 { // CHECK: nvvm.redux.sync add %{{.*}} %r1 = nvvm.redux.sync add %value, %offset : i32 -> i32 // CHECK: nvvm.redux.sync max %{{.*}} @@ -324,9 +324,9 @@ llvm.func @redux_sync(%value : 
i32, %offset : i32) -> i32 { // CHECK: nvvm.redux.sync umin %{{.*}} %r6 = nvvm.redux.sync umin %value, %offset : i32 -> i32 // CHECK: nvvm.redux.sync and %{{.*}} - %r7 = nvvm.redux.sync and %value, %offset : i32 -> i32 + %r7 = nvvm.redux.sync and %value, %offset : i32 -> i32 // CHECK: nvvm.redux.sync or %{{.*}} - %r8 = nvvm.redux.sync or %value, %offset : i32 -> i32 + %r8 = nvvm.redux.sync or %value, %offset : i32 -> i32 // CHECK: nvvm.redux.sync xor %{{.*}} %r9 = nvvm.redux.sync xor %value, %offset : i32 -> i32 llvm.return %r1 : i32 diff --git a/mlir/test/Dialect/LLVMIR/parameter-attrs-invalid-typed-pointers.mlir b/mlir/test/Dialect/LLVMIR/parameter-attrs-invalid-typed-pointers.mlir new file mode 100644 index 0000000000000..65411ff41e285 --- /dev/null +++ b/mlir/test/Dialect/LLVMIR/parameter-attrs-invalid-typed-pointers.mlir @@ -0,0 +1,6 @@ +// RUN: mlir-opt %s -split-input-file -verify-diagnostics + +// Argument attributes + +// expected-error@below {{"llvm.sret" attribute attached to LLVM pointer argument of different type}} +llvm.func @invalid_sret_attr_type(%0 : !llvm.ptr {llvm.sret = !llvm.struct<(i32)>}) diff --git a/mlir/test/Dialect/LLVMIR/parameter-attrs-invalid.mlir b/mlir/test/Dialect/LLVMIR/parameter-attrs-invalid.mlir index 72bf45052ef13..d7ee6097b3600 100644 --- a/mlir/test/Dialect/LLVMIR/parameter-attrs-invalid.mlir +++ b/mlir/test/Dialect/LLVMIR/parameter-attrs-invalid.mlir @@ -47,11 +47,6 @@ llvm.func @invalid_sret_arg_type(%0 : i32 {llvm.sret = !llvm.struct<(i32)>}) // ----- -// expected-error@below {{"llvm.sret" attribute attached to LLVM pointer argument of different type}} -llvm.func @invalid_sret_attr_type(%0 : !llvm.ptr {llvm.sret = !llvm.struct<(i32)>}) - -// ----- - // expected-error@below {{"llvm.byval" attribute attached to non-pointer LLVM type}} llvm.func @invalid_byval_arg_type(%0 : i32 {llvm.byval = !llvm.struct<(i32)>}) diff --git a/mlir/test/Dialect/LLVMIR/types-invalid-typed-pointers.mlir b/mlir/test/Dialect/LLVMIR/types-invalid-typed-pointers.mlir new file mode 100644 index 0000000000000..475fadede8fbf --- /dev/null +++ b/mlir/test/Dialect/LLVMIR/types-invalid-typed-pointers.mlir @@ -0,0 +1,42 @@ +// RUN: mlir-opt --allow-unregistered-dialect -split-input-file -verify-diagnostics %s + +func.func @void_pointer() { + // expected-error @+1 {{invalid pointer element type}} + "some.op"() : () -> !llvm.ptr +} + +// ----- + +func.func @repeated_struct_name() { + "some.op"() : () -> !llvm.struct<"a", (ptr>)> + // expected-error @+1 {{identified type already used with a different body}} + "some.op"() : () -> !llvm.struct<"a", (i32)> +} + +// ----- + +func.func @dynamic_vector() { + // expected-error @+1 {{expected '? x x ' or ' x '}} + "some.op"() : () -> !llvm.vec> +} + +// ----- + +func.func @dynamic_scalable_vector() { + // expected-error @+1 {{expected '? x x ' or ' x '}} + "some.op"() : () -> !llvm.vec> +} + +// ----- + +func.func @unscalable_vector() { + // expected-error @+1 {{expected '? 
x x ' or ' x '}} + "some.op"() : () -> !llvm.vec<4x4 x ptr> +} + +// ----- + +func.func @zero_vector() { + // expected-error @+1 {{the number of vector elements must be positive}} + "some.op"() : () -> !llvm.vec<0 x ptr> +} diff --git a/mlir/test/Dialect/LLVMIR/types-invalid.mlir b/mlir/test/Dialect/LLVMIR/types-invalid.mlir index d8ac523b86d93..fce100e6a865c 100644 --- a/mlir/test/Dialect/LLVMIR/types-invalid.mlir +++ b/mlir/test/Dialect/LLVMIR/types-invalid.mlir @@ -21,15 +21,8 @@ func.func @function_taking_function() { // ----- -func.func @void_pointer() { - // expected-error @+1 {{invalid pointer element type}} - "some.op"() : () -> !llvm.ptr -} - -// ----- - func.func @repeated_struct_name() { - "some.op"() : () -> !llvm.struct<"a", (ptr>)> + "some.op"() : () -> !llvm.struct<"a", (ptr)> // expected-error @+1 {{identified type already used with a different body}} "some.op"() : () -> !llvm.struct<"a", (i32)> } @@ -113,28 +106,28 @@ func.func @identified_struct_with_void() { func.func @dynamic_vector() { // expected-error @+1 {{expected '? x x ' or ' x '}} - "some.op"() : () -> !llvm.vec> + "some.op"() : () -> !llvm.vec } // ----- func.func @dynamic_scalable_vector() { // expected-error @+1 {{expected '? x x ' or ' x '}} - "some.op"() : () -> !llvm.vec> + "some.op"() : () -> !llvm.vec } // ----- func.func @unscalable_vector() { // expected-error @+1 {{expected '? x x ' or ' x '}} - "some.op"() : () -> !llvm.vec<4x4 x ptr> + "some.op"() : () -> !llvm.vec<4x4 x ptr> } // ----- func.func @zero_vector() { // expected-error @+1 {{the number of vector elements must be positive}} - "some.op"() : () -> !llvm.vec<0 x ptr> + "some.op"() : () -> !llvm.vec<0 x ptr> } // ----- diff --git a/mlir/test/Dialect/LLVMIR/types-typed-pointers.mlir b/mlir/test/Dialect/LLVMIR/types-typed-pointers.mlir new file mode 100644 index 0000000000000..2d63f379c2ee7 --- /dev/null +++ b/mlir/test/Dialect/LLVMIR/types-typed-pointers.mlir @@ -0,0 +1,118 @@ +// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file | mlir-opt -allow-unregistered-dialect | FileCheck %s + +// CHECK-LABEL: @ptr +func.func @ptr() { + // CHECK: !llvm.ptr + "some.op"() : () -> !llvm.ptr + // CHECK: !llvm.ptr + "some.op"() : () -> !llvm.ptr + // CHECK: !llvm.ptr> + "some.op"() : () -> !llvm.ptr> + // CHECK: !llvm.ptr>>>> + "some.op"() : () -> !llvm.ptr>>>> + // CHECK: !llvm.ptr + "some.op"() : () -> !llvm.ptr + // CHECK: !llvm.ptr + "some.op"() : () -> !llvm.ptr + // CHECK: !llvm.ptr + "some.op"() : () -> !llvm.ptr + // CHECK: !llvm.ptr, 9> + "some.op"() : () -> !llvm.ptr, 9> + // CHECK: !llvm.ptr + "some.op"() : () -> !llvm.ptr + // CHECK: !llvm.ptr<42> + "some.op"() : () -> !llvm.ptr<42> + return +} + +// CHECK-LABEL: @vec +func.func @vec() { + // CHECK: vector<4xi32> + "some.op"() : () -> vector<4xi32> + // CHECK: vector<4xf32> + "some.op"() : () -> vector<4xf32> + // CHECK: !llvm.vec + "some.op"() : () -> !llvm.vec + // CHECK: !llvm.vec + "some.op"() : () -> !llvm.vec + // CHECK: !llvm.vec<4 x ptr> + "some.op"() : () -> !llvm.vec<4 x ptr> + return +} + +// CHECK-LABEL: @array +func.func @array() { + // CHECK: !llvm.array<10 x i32> + "some.op"() : () -> !llvm.array<10 x i32> + // CHECK: !llvm.array<8 x f32> + "some.op"() : () -> !llvm.array<8 x f32> + // CHECK: !llvm.array<10 x ptr> + "some.op"() : () -> !llvm.array<10 x ptr> + // CHECK: !llvm.array<10 x array<4 x f32>> + "some.op"() : () -> !llvm.array<10 x array<4 x f32>> + return +} + +// CHECK-LABEL: @identified_struct +func.func @identified_struct() { + // CHECK: 
!llvm.struct<"empty", ()> + "some.op"() : () -> !llvm.struct<"empty", ()> + // CHECK: !llvm.struct<"opaque", opaque> + "some.op"() : () -> !llvm.struct<"opaque", opaque> + // CHECK: !llvm.struct<"long", (i32, struct<(i32, i1)>, f32, ptr>)> + "some.op"() : () -> !llvm.struct<"long", (i32, struct<(i32, i1)>, f32, ptr>)> + // CHECK: !llvm.struct<"self-recursive", (ptr>)> + "some.op"() : () -> !llvm.struct<"self-recursive", (ptr>)> + // CHECK: !llvm.struct<"unpacked", (i32)> + "some.op"() : () -> !llvm.struct<"unpacked", (i32)> + // CHECK: !llvm.struct<"packed", packed (i32)> + "some.op"() : () -> !llvm.struct<"packed", packed (i32)> + // CHECK: !llvm.struct<"name with spaces and !^$@$#", packed (i32)> + "some.op"() : () -> !llvm.struct<"name with spaces and !^$@$#", packed (i32)> + + // CHECK: !llvm.struct<"mutually-a", (ptr, 3>)>>)> + "some.op"() : () -> !llvm.struct<"mutually-a", (ptr, 3>)>>)> + // CHECK: !llvm.struct<"mutually-b", (ptr>)>, 3>)> + "some.op"() : () -> !llvm.struct<"mutually-b", (ptr>)>, 3>)> + // CHECK: !llvm.struct<"referring-another", (ptr>)> + "some.op"() : () -> !llvm.struct<"referring-another", (ptr>)> + + // CHECK: !llvm.struct<"struct-of-arrays", (array<10 x i32>)> + "some.op"() : () -> !llvm.struct<"struct-of-arrays", (array<10 x i32>)> + // CHECK: !llvm.array<10 x struct<"array-of-structs", (i32)>> + "some.op"() : () -> !llvm.array<10 x struct<"array-of-structs", (i32)>> + // CHECK: !llvm.ptr> + "some.op"() : () -> !llvm.ptr> + return +} + +// CHECK-LABEL: @ptr_elem_interface +// CHECK-COUNT-3: !llvm.ptr +// CHECK: llvm.mlir.undef : !llvm.ptr +func.func @ptr_elem_interface(%arg0: !llvm.ptr) { + %0 = llvm.load %arg0 : !llvm.ptr + llvm.store %0, %arg0 : !llvm.ptr + llvm.mlir.undef : !llvm.ptr + return +} + +// ----- + +// Check that type aliases can be used inside LLVM dialect types. Note that +// currently they are _not_ printed back as this would require +// DialectAsmPrinter to have a mechanism for querying the presence and +// usability of an alias outside of its `printType` method. 
+ +!baz = i64 +!qux = !llvm.struct<(!baz)> + +!rec = !llvm.struct<"a", (ptr>)> + +// CHECK: aliases +llvm.func @aliases() { + // CHECK: !llvm.struct<(i32, f32, struct<(i64)>)> + "some.op"() : () -> !llvm.struct<(i32, f32, !qux)> + // CHECK: !llvm.struct<"a", (ptr>)> + "some.op"() : () -> !rec + llvm.return +} diff --git a/mlir/test/Dialect/LLVMIR/types.mlir b/mlir/test/Dialect/LLVMIR/types.mlir index 54c44a6aa58ab..42352ce697f02 100644 --- a/mlir/test/Dialect/LLVMIR/types.mlir +++ b/mlir/test/Dialect/LLVMIR/types.mlir @@ -57,26 +57,14 @@ func.func @integer() { // CHECK-LABEL: @ptr func.func @ptr() { - // CHECK: !llvm.ptr - "some.op"() : () -> !llvm.ptr - // CHECK: !llvm.ptr - "some.op"() : () -> !llvm.ptr - // CHECK: !llvm.ptr> - "some.op"() : () -> !llvm.ptr> - // CHECK: !llvm.ptr>>>> - "some.op"() : () -> !llvm.ptr>>>> - // CHECK: !llvm.ptr - "some.op"() : () -> !llvm.ptr - // CHECK: !llvm.ptr - "some.op"() : () -> !llvm.ptr - // CHECK: !llvm.ptr - "some.op"() : () -> !llvm.ptr - // CHECK: !llvm.ptr, 9> - "some.op"() : () -> !llvm.ptr, 9> // CHECK: !llvm.ptr "some.op"() : () -> !llvm.ptr + // CHECK: !llvm.ptr + "some.op"() : () -> !llvm.ptr<0> // CHECK: !llvm.ptr<42> "some.op"() : () -> !llvm.ptr<42> + // CHECK: !llvm.ptr, 9> + "some.op"() : () -> !llvm.ptr, 9> return } @@ -90,8 +78,8 @@ func.func @vec() { "some.op"() : () -> !llvm.vec // CHECK: !llvm.vec "some.op"() : () -> !llvm.vec - // CHECK: !llvm.vec<4 x ptr> - "some.op"() : () -> !llvm.vec<4 x ptr> + // CHECK: !llvm.vec<4 x ptr> + "some.op"() : () -> !llvm.vec<4 x ptr> return } @@ -101,8 +89,8 @@ func.func @array() { "some.op"() : () -> !llvm.array<10 x i32> // CHECK: !llvm.array<8 x f32> "some.op"() : () -> !llvm.array<8 x f32> - // CHECK: !llvm.array<10 x ptr> - "some.op"() : () -> !llvm.array<10 x ptr> + // CHECK: !llvm.array<10 x ptr<4>> + "some.op"() : () -> !llvm.array<10 x ptr<4>> // CHECK: !llvm.array<10 x array<4 x f32>> "some.op"() : () -> !llvm.array<10 x array<4 x f32>> return @@ -147,30 +135,22 @@ func.func @identified_struct() { "some.op"() : () -> !llvm.struct<"empty", ()> // CHECK: !llvm.struct<"opaque", opaque> "some.op"() : () -> !llvm.struct<"opaque", opaque> - // CHECK: !llvm.struct<"long", (i32, struct<(i32, i1)>, f32, ptr>)> - "some.op"() : () -> !llvm.struct<"long", (i32, struct<(i32, i1)>, f32, ptr>)> - // CHECK: !llvm.struct<"self-recursive", (ptr>)> - "some.op"() : () -> !llvm.struct<"self-recursive", (ptr>)> + // CHECK: !llvm.struct<"long", (i32, struct<(i32, i1)>, f32, ptr)> + "some.op"() : () -> !llvm.struct<"long", (i32, struct<(i32, i1)>, f32, ptr)> // CHECK: !llvm.struct<"unpacked", (i32)> "some.op"() : () -> !llvm.struct<"unpacked", (i32)> // CHECK: !llvm.struct<"packed", packed (i32)> "some.op"() : () -> !llvm.struct<"packed", packed (i32)> // CHECK: !llvm.struct<"name with spaces and !^$@$#", packed (i32)> "some.op"() : () -> !llvm.struct<"name with spaces and !^$@$#", packed (i32)> - - // CHECK: !llvm.struct<"mutually-a", (ptr, 3>)>>)> - "some.op"() : () -> !llvm.struct<"mutually-a", (ptr, 3>)>>)> - // CHECK: !llvm.struct<"mutually-b", (ptr>)>, 3>)> - "some.op"() : () -> !llvm.struct<"mutually-b", (ptr>)>, 3>)> - // CHECK: !llvm.struct<"referring-another", (ptr>)> - "some.op"() : () -> !llvm.struct<"referring-another", (ptr>)> - + // CHECK: !llvm.struct<"outer", (struct<"nested", ()>)> + "some.op"() : () -> !llvm.struct<"outer", (struct<"nested", ()>)> + // CHECK: !llvm.struct<"referring-another", (ptr)> + "some.op"() : () -> !llvm.struct<"referring-another", (ptr)> // CHECK: 
!llvm.struct<"struct-of-arrays", (array<10 x i32>)> "some.op"() : () -> !llvm.struct<"struct-of-arrays", (array<10 x i32>)> // CHECK: !llvm.array<10 x struct<"array-of-structs", (i32)>> "some.op"() : () -> !llvm.array<10 x struct<"array-of-structs", (i32)>> - // CHECK: !llvm.ptr> - "some.op"() : () -> !llvm.ptr> return } @@ -180,16 +160,6 @@ func.func @verbose() { return } -// CHECK-LABEL: @ptr_elem_interface -// CHECK-COUNT-3: !llvm.ptr -// CHECK: llvm.mlir.undef : !llvm.ptr -func.func @ptr_elem_interface(%arg0: !llvm.ptr) { - %0 = llvm.load %arg0 : !llvm.ptr - llvm.store %0, %arg0 : !llvm.ptr - llvm.mlir.undef : !llvm.ptr - return -} - // ----- // Check that type aliases can be used inside LLVM dialect types. Note that @@ -200,13 +170,9 @@ func.func @ptr_elem_interface(%arg0: !llvm.ptr) { !baz = i64 !qux = !llvm.struct<(!baz)> -!rec = !llvm.struct<"a", (ptr>)> - // CHECK: aliases llvm.func @aliases() { // CHECK: !llvm.struct<(i32, f32, struct<(i64)>)> "some.op"() : () -> !llvm.struct<(i32, f32, !qux)> - // CHECK: !llvm.struct<"a", (ptr>)> - "some.op"() : () -> !rec llvm.return } From 3486f5f0b40114f50f0febd2e326d3ad4f696c66 Mon Sep 17 00:00:00 2001 From: Johannes de Fine Licht Date: Wed, 22 Mar 2023 16:02:40 +0100 Subject: [PATCH 208/208] [MLIR][LLVM] Add debug output to the LLVM inliner. This revealed a test case that wasn't hitting the intended branch because the inlinees had no function definition. Depends on D146628 Differential Revision: https://reviews.llvm.org/D146633 --- mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp | 61 +++++++++++++++++---- mlir/test/Dialect/LLVMIR/inlining.mlir | 29 ++++++++-- 2 files changed, 74 insertions(+), 16 deletions(-) diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp index 8a399b9a5d030..23dd22b9cbd03 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp @@ -15,6 +15,9 @@ #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/IR/Matchers.h" #include "mlir/Transforms/InliningUtils.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "llvm-inliner" using namespace mlir; @@ -134,9 +137,17 @@ struct LLVMInlinerInterface : public DialectInlinerInterface { if (!wouldBeCloned) return false; auto callOp = dyn_cast(call); + if (!callOp) { + LLVM_DEBUG(llvm::dbgs() + << "Cannot inline: call is not an LLVM::CallOp\n"); + return false; + } auto funcOp = dyn_cast(callable); - if (!callOp || !funcOp) + if (!funcOp) { + LLVM_DEBUG(llvm::dbgs() + << "Cannot inline: callable is not an LLVM::LLVMFuncOp\n"); return false; + } if (auto attrs = funcOp.getArgAttrs()) { for (Attribute attr : *attrs) { auto attrDict = cast(attr); @@ -144,16 +155,25 @@ struct LLVMInlinerInterface : public DialectInlinerInterface { if (attr.getName() == LLVM::LLVMDialect::getByValAttrName()) continue; // TODO: Handle all argument attributes; + LLVM_DEBUG(llvm::dbgs() << "Cannot inline " << funcOp.getSymName() + << ": unhandled argument attribute \"" + << attr.getName() << "\"\n"); return false; } } } // TODO: Handle result attributes; - if (funcOp.getResAttrs()) + if (funcOp.getResAttrs()) { + LLVM_DEBUG(llvm::dbgs() << "Cannot inline " << funcOp.getSymName() + << ": unhandled result attribute\n"); return false; + } // TODO: Handle exceptions. 
- if (funcOp.getPersonality()) + if (funcOp.getPersonality()) { + LLVM_DEBUG(llvm::dbgs() << "Cannot inline " << funcOp.getSymName() + << ": unhandled function personality\n"); return false; + } if (funcOp.getPassthrough()) { // TODO: Used attributes should not be passthrough. DenseSet disallowed( @@ -167,7 +187,14 @@ struct LLVMInlinerInterface : public DialectInlinerInterface { auto stringAttr = dyn_cast(attr); if (!stringAttr) return false; - return disallowed.contains(stringAttr); + if (disallowed.contains(stringAttr)) { + LLVM_DEBUG(llvm::dbgs() + << "Cannot inline " << funcOp.getSymName() + << ": found disallowed function attribute " + << stringAttr << "\n"); + return true; + } + return false; })) return false; } @@ -185,14 +212,28 @@ struct LLVMInlinerInterface : public DialectInlinerInterface { // Some attributes on memory operations require handling during // inlining. Since this is not yet implemented, refuse to inline memory // operations that have any of these attributes. - if (auto iface = dyn_cast(op)) - if (iface.getAliasScopesOrNull() || iface.getNoAliasScopesOrNull()) + if (auto iface = dyn_cast(op)) { + if (iface.getAliasScopesOrNull() || iface.getNoAliasScopesOrNull()) { + LLVM_DEBUG(llvm::dbgs() + << "Cannot inline: unhandled alias analysis metadata\n"); return false; - if (auto iface = dyn_cast(op)) - if (iface.getAccessGroupsOrNull()) + } + } + if (auto iface = dyn_cast(op)) { + if (iface.getAccessGroupsOrNull()) { + LLVM_DEBUG(llvm::dbgs() + << "Cannot inline: unhandled access group metadata\n"); return false; - return isa(op); + } + } + if (!isa(op)) { + LLVM_DEBUG(llvm::dbgs() + << "Cannot inline: unhandled side effecting operation \"" + << op->getName() << "\"\n"); + return false; + } + return true; } /// Handle the given inlined return by replacing it with a branch. This diff --git a/mlir/test/Dialect/LLVMIR/inlining.mlir b/mlir/test/Dialect/LLVMIR/inlining.mlir index cefb8d5e461d4..e6dc047fd42b9 100644 --- a/mlir/test/Dialect/LLVMIR/inlining.mlir +++ b/mlir/test/Dialect/LLVMIR/inlining.mlir @@ -160,12 +160,29 @@ llvm.func @caller() { // ----- -llvm.func @callee_noinline() attributes { passthrough = ["noinline"] } -llvm.func @callee_optnone() attributes { passthrough = ["optnone"] } -llvm.func @callee_noduplicate() attributes { passthrough = ["noduplicate"] } -llvm.func @callee_presplitcoroutine() attributes { passthrough = ["presplitcoroutine"] } -llvm.func @callee_returns_twice() attributes { passthrough = ["returns_twice"] } -llvm.func @callee_strictfp() attributes { passthrough = ["strictfp"] } +llvm.func @callee_noinline() attributes { passthrough = ["noinline"] } { + llvm.return +} + +llvm.func @callee_optnone() attributes { passthrough = ["optnone"] } { + llvm.return +} + +llvm.func @callee_noduplicate() attributes { passthrough = ["noduplicate"] } { + llvm.return +} + +llvm.func @callee_presplitcoroutine() attributes { passthrough = ["presplitcoroutine"] } { + llvm.return +} + +llvm.func @callee_returns_twice() attributes { passthrough = ["returns_twice"] } { + llvm.return +} + +llvm.func @callee_strictfp() attributes { passthrough = ["strictfp"] } { + llvm.return +} // CHECK-LABEL: llvm.func @caller // CHECK-NEXT: llvm.call @callee_noinline